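; LLVM IR dump from gist anonymous/d8fa9ec0295e4ae808a8150e776b6871 (created
; June 1, 2016), apparently from Eigen's cxx11_tensor_reduction_cuda test compiled
; for sm_35 (target triple nvptx64-nvidia-cuda). The module instantiates Eigen's
; full/inner/outer reduction kernels and the anonymous-namespace row/column reduce
; kernels for 2-D float tensors in both row-major (Options = 1) and column-major
; (Options = 0) layouts.
;
; For orientation, host code of roughly this shape instantiates kernels like these
; (a hedged sketch against the public Eigen Tensor API; the device pointers and
; variable names are illustrative, not taken from the original test):
;
;   Eigen::CudaStreamDevice stream;
;   Eigen::GpuDevice gpu_device(&stream);
;   // Map pre-allocated device buffers as a 2-D input and a 1-D output tensor.
;   Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int>> in(d_in, rows, cols);
;   Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor, int>> out(d_out, cols);
;   Eigen::array<int, 1> red_dim{{0}};          // reduce over the first dimension
;   out.device(gpu_device) = in.sum(red_dim);   // dispatches one of the kernels below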
; ModuleID = '<stdin>'
source_filename = "cxx11_tensor_reduction_cuda-sm_35.cui"
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
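; Struct layouts for the Eigen evaluator and reducer objects that are passed
; byval to the kernels below; the numeric suffixes distinguish separate template
; instantiations (several of which have identical field layouts).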
%"struct.Eigen::internal::SumReducer" = type { i8 }
%"struct.Eigen::TensorEvaluator" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* }
%"class.Eigen::array" = type { [2 x i8] }
%"struct.Eigen::DSizes" = type { %"class.Eigen::array.0" }
%"class.Eigen::array.1" = type { [2 x i32] }
%"class.Eigen::array.2" = type { [1 x %"struct.Eigen::internal::TensorIntDivisor"] }
%"struct.Eigen::internal::TensorIntDivisor" = type { i32, i32, i32 }
%"class.Eigen::array.0" = type { [1 x i32] }
%"struct.Eigen::TensorEvaluator.3" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::DSizes.4" = type { %"class.Eigen::array.1" }
%"struct.Eigen::GpuDevice" = type { %"class.Eigen::StreamInterface"* }
%"class.Eigen::StreamInterface" = type { i32 (...)** }
%"struct.Eigen::TensorEvaluator.5" = type { %"struct.Eigen::TensorEvaluator", %"struct.Eigen::GpuDevice"*, float* }
%"struct.Eigen::internal::PtrWrapper" = type { float* }
%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" }
%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::Identity" }
%"struct.Eigen::internal::(anonymous namespace)::Identity" = type { i8 }
%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer" = type { float }
%"struct.Eigen::TensorEvaluator.6" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.8" }
%"struct.Eigen::TensorEvaluator.7" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.8" = type { %"struct.Eigen::TensorEvaluator", %"class.Eigen::TensorReductionOp", %"struct.Eigen::GpuDevice"*, float* }
%"class.Eigen::TensorReductionOp" = type <{ %"class.Eigen::TensorMap"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }>
%"class.Eigen::TensorMap" = type { float*, %"struct.Eigen::DSizes.4" }
%"struct.Eigen::TensorEvaluator.11" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator" }
%"struct.Eigen::TensorEvaluator.12" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.13" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.14" = type { %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::GpuDevice"*, float* }
%"struct.Eigen::TensorEvaluator.15" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.17" }
%"struct.Eigen::TensorEvaluator.16" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.17" = type { %"struct.Eigen::TensorEvaluator.12", %"class.Eigen::TensorReductionOp.18", %"struct.Eigen::GpuDevice"*, float* }
%"class.Eigen::TensorReductionOp.18" = type <{ %"class.Eigen::TensorMap.20"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }>
%"class.Eigen::TensorMap.20" = type { float*, %"struct.Eigen::DSizes.4" }
%"struct.Eigen::TensorEvaluator.24" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.12" }
$_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_ = comdat any
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
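; Assertion-failure strings: the anonymous-namespace Row/ColumnReduceKernels check
; that the actual launch configuration matches their GRID_DIM/BLOCK_DIM template
; parameters (see the asserts at the top of each such kernel).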
@.str = private unnamed_addr constant [24 x i8] c"blockDim.x == BLOCK_DIM\00", align 1
@.str.1 = private unnamed_addr constant [76 x i8] c"third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@.str.2 = private unnamed_addr constant [16 x i8] c"blockDim.y == 1\00", align 1
@.str.3 = private unnamed_addr constant [16 x i8] c"blockDim.z == 1\00", align 1
@.str.4 = private unnamed_addr constant [22 x i8] c"gridDim.x == GRID_DIM\00", align 1
@.str.5 = private unnamed_addr constant [15 x i8] c"gridDim.y == 1\00", align 1
@.str.6 = private unnamed_addr constant [15 x i8] c"gridDim.z == 1\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
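; ReductionInitKernel<float, int>: grid-stride loop that stores the reducer's
; initial value (%0) into the first %1 elements of the output buffer %2.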
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_(float, i32, float*) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = add nuw nsw i32 %6, %7
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%10 = mul nuw nsw i32 %9, %5
%11 = icmp slt i32 %8, %1
br i1 %11, label %.lr.ph.preheader, label %._crit_edge
.lr.ph.preheader: ; preds = %3
br label %.lr.ph
._crit_edge.loopexit: ; preds = %.lr.ph
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %3
ret void
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.012 = phi i32 [ %14, %.lr.ph ], [ %8, %.lr.ph.preheader ]
%12 = sext i32 %.012 to i64
%13 = getelementptr inbounds float, float* %2, i64 %12
store float %0, float* %13, align 4
%14 = add nsw i32 %.012, %10
%15 = icmp slt i32 %14, %1
br i1 %15, label %.lr.ph, label %._crit_edge.loopexit
}
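; PTX special-register reads: ctaid.x = blockIdx.x, ntid.x = blockDim.x,
; tid.x = threadIdx.x, nctaid.x = gridDim.x.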
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ctaid.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.tid.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.x() #1
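; FullReductionKernel<BlockSize = 256, NumPerThread = 128>: when gridDim.x == 1,
; thread 0 zero-initializes the scalar output before a __syncthreads(). Each
; thread then sums up to 32768 (= 256 * 128) coefficients per block through
; __ldg loads (unrolled x8 plus a remainder loop), warp-reduces with
; shfl.down.b32, and lane 0 of every warp atomically adds its partial sum into
; the output.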
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, float*) #2 comdat {
%5 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%6 = shl nuw nsw i32 %5, 15
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = or i32 %6, %7
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%10 = icmp eq i32 %9, 1
br i1 %10, label %11, label %15
; <label>:11: ; preds = %4
%12 = icmp eq i32 %8, 0
br i1 %12, label %13, label %14
; <label>:13: ; preds = %11
store float 0.000000e+00, float* %3, align 4
br label %14
; <label>:14: ; preds = %13, %11
tail call void @llvm.cuda.syncthreads()
br label %15
; <label>:15: ; preds = %14, %4
%16 = sub nsw i32 %2, %8
%17 = icmp sgt i32 %16, 32768
%..i = select i1 %17, i32 32768, i32 %16
%18 = icmp sgt i32 %16, 0
br i1 %18, label %.lr.ph, label %.preheader.preheader
.preheader.preheader.loopexit: ; preds = %.epil.preheader
%.lcssa47 = phi float [ %23, %.epil.preheader ]
br label %.preheader.preheader
.preheader.preheader: ; preds = %.preheader.preheader.loopexit, %.preheader.preheader.loopexit.unr-lcssa, %15
%.132.ph = phi float [ 0.000000e+00, %15 ], [ %.lcssa36.ph, %.preheader.preheader.loopexit.unr-lcssa ], [ %.lcssa47, %.preheader.preheader.loopexit ]
br label %.preheader
.preheader.preheader.loopexit.unr-lcssa.loopexit: ; preds = %32
%.lcssa49 = phi i32 [ %80, %32 ]
%.lcssa48 = phi float [ %79, %32 ]
br label %.preheader.preheader.loopexit.unr-lcssa
.preheader.preheader.loopexit.unr-lcssa: ; preds = %.preheader.preheader.loopexit.unr-lcssa.loopexit, %.lr.ph
%.lcssa36.ph = phi float [ undef, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%.02535.unr = phi i32 [ 0, %.lr.ph ], [ %.lcssa49, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%.03134.unr = phi float [ 0.000000e+00, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %.preheader.preheader, label %.epil.preheader.preheader
.epil.preheader.preheader: ; preds = %.preheader.preheader.loopexit.unr-lcssa
br label %.epil.preheader
.epil.preheader: ; preds = %.epil.preheader.preheader, %.epil.preheader
%.02535.epil = phi i32 [ %24, %.epil.preheader ], [ %.02535.unr, %.epil.preheader.preheader ]
%.03134.epil = phi float [ %23, %.epil.preheader ], [ %.03134.unr, %.epil.preheader.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.epil.preheader ], [ %xtraiter, %.epil.preheader.preheader ]
%19 = add nuw nsw i32 %.02535.epil, %8
%20 = sext i32 %19 to i64
%21 = getelementptr inbounds float, float* %26, i64 %20
%22 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %21, i32 4) #8
%23 = fadd float %.03134.epil, %22
%24 = add nuw nsw i32 %.02535.epil, 256
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %.preheader.preheader.loopexit, label %.epil.preheader, !llvm.loop !50
.lr.ph: ; preds = %15
%25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0
%26 = load float*, float** %25, align 8
%27 = icmp sgt i32 %..i, 256
%smax = select i1 %27, i32 %..i, i32 256
%28 = add i32 %smax, -1
%29 = lshr i32 %28, 8
%30 = add nuw nsw i32 %29, 1
%xtraiter = and i32 %30, 7
%31 = icmp ult i32 %28, 1792
br i1 %31, label %.preheader.preheader.loopexit.unr-lcssa, label %.lr.ph.new
.lr.ph.new: ; preds = %.lr.ph
%unroll_iter = sub nsw i32 %30, %xtraiter
br label %32
; <label>:32: ; preds = %32, %.lr.ph.new
%.02535 = phi i32 [ 0, %.lr.ph.new ], [ %80, %32 ]
%.03134 = phi float [ 0.000000e+00, %.lr.ph.new ], [ %79, %32 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.new ], [ %niter.nsub.7, %32 ]
%33 = add nuw nsw i32 %.02535, %8
%34 = sext i32 %33 to i64
%35 = getelementptr inbounds float, float* %26, i64 %34
%36 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %35, i32 4) #8
%37 = fadd float %.03134, %36
%38 = or i32 %.02535, 256
%39 = add nuw nsw i32 %38, %8
%40 = sext i32 %39 to i64
%41 = getelementptr inbounds float, float* %26, i64 %40
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
%43 = fadd float %37, %42
%44 = or i32 %.02535, 512
%45 = add nuw nsw i32 %44, %8
%46 = sext i32 %45 to i64
%47 = getelementptr inbounds float, float* %26, i64 %46
%48 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %47, i32 4) #8
%49 = fadd float %43, %48
%50 = or i32 %.02535, 768
%51 = add nuw nsw i32 %50, %8
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %26, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = or i32 %.02535, 1024
%57 = add nuw nsw i32 %56, %8
%58 = sext i32 %57 to i64
%59 = getelementptr inbounds float, float* %26, i64 %58
%60 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %59, i32 4) #8
%61 = fadd float %55, %60
%62 = or i32 %.02535, 1280
%63 = add nuw nsw i32 %62, %8
%64 = sext i32 %63 to i64
%65 = getelementptr inbounds float, float* %26, i64 %64
%66 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %65, i32 4) #8
%67 = fadd float %61, %66
%68 = or i32 %.02535, 1536
%69 = add nuw nsw i32 %68, %8
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %26, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %67, %72
%74 = or i32 %.02535, 1792
%75 = add nuw nsw i32 %74, %8
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %26, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = fadd float %73, %78
%80 = add nsw i32 %.02535, 2048
%niter.nsub.7 = add i32 %niter, -8
%niter.ncmp.7 = icmp eq i32 %niter.nsub.7, 0
br i1 %niter.ncmp.7, label %.preheader.preheader.loopexit.unr-lcssa.loopexit, label %32, !llvm.loop !52
; <label>:81: ; preds = %.preheader
%.lcssa = phi float [ %85, %.preheader ]
%82 = and i32 %7, 31
%83 = icmp eq i32 %82, 0
br i1 %83, label %88, label %90
.preheader: ; preds = %.preheader.preheader, %.preheader
%.033 = phi i32 [ %86, %.preheader ], [ 16, %.preheader.preheader ]
%.132 = phi float [ %85, %.preheader ], [ %.132.ph, %.preheader.preheader ]
%84 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.132, i32 %.033, i32 31) #3, !srcloc !53
%85 = fadd float %.132, %84
%86 = lshr i32 %.033, 1
%87 = icmp eq i32 %86, 0
br i1 %87, label %81, label %.preheader, !llvm.loop !54
; <label>:88: ; preds = %81
%89 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %3, float %.lcssa) #8
br label %90
; <label>:90: ; preds = %88, %81
ret void
}
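; CUDA builtins used above: __syncthreads(), __ldg() read-only cached loads, and
; atomicAdd on float.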
; Function Attrs: convergent nounwind
declare void @llvm.cuda.syncthreads() #3
; Function Attrs: argmemonly nounwind readonly
declare float @llvm.nvvm.ldg.global.f.f32.p0f32(float* nocapture, i32) #4
; Function Attrs: argmemonly nounwind
declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* nocapture, float) #5
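; InnerReductionKernel<NumPerThread = 128>: reduces along the inner (contiguous)
; dimension. When gridDim.x == 1 it zero-fills the %3 output slots in-kernel
; (otherwise the output is presumably pre-initialized, e.g. by
; ReductionInitKernel). Each thread then accumulates up to 128 coefficients of
; its assigned row in 16-way unrolled strides of blockDim.x, warp-reduces via
; shfl.down.b32, and lane 0 atomicAdds the partial sum into the corresponding
; output element.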
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = shl nuw nsw i32 %6, 7
%8 = add i32 %2, -1
%9 = add i32 %8, %7
%10 = udiv i32 %9, %7
%11 = mul nsw i32 %10, %3
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %6
%14 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%15 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%16 = icmp eq i32 %12, 1
br i1 %16, label %22, label %.preheader94
.preheader94.loopexit: ; preds = %.lr.ph109
br label %.preheader94
.preheader94: ; preds = %.preheader94.loopexit, %22, %5
%17 = icmp slt i32 %14, %11
br i1 %17, label %.lr.ph106, label %._crit_edge
.lr.ph106: ; preds = %.preheader94
%18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0
%19 = load float*, float** %18, align 8
%20 = and i32 %15, 31
%21 = icmp eq i32 %20, 0
br label %30
; <label>:22: ; preds = %5
%23 = mul nuw nsw i32 %14, %6
%24 = add nuw nsw i32 %23, %15
%25 = icmp slt i32 %24, %3
br i1 %25, label %.lr.ph109.preheader, label %.preheader94
.lr.ph109.preheader: ; preds = %22
br label %.lr.ph109
.lr.ph109: ; preds = %.lr.ph109.preheader, %.lr.ph109
%.081107 = phi i32 [ %28, %.lr.ph109 ], [ %24, %.lr.ph109.preheader ]
%26 = sext i32 %.081107 to i64
%27 = getelementptr inbounds float, float* %4, i64 %26
store float 0.000000e+00, float* %27, align 4
%28 = add nsw i32 %.081107, %13
%29 = icmp slt i32 %28, %3
br i1 %29, label %.lr.ph109, label %.preheader94.loopexit
._crit_edge.loopexit: ; preds = %177
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %.preheader94
ret void
; <label>:30: ; preds = %.lr.ph106, %177
%.083105 = phi i32 [ %14, %.lr.ph106 ], [ %178, %177 ]
%31 = sdiv i32 %.083105, %10
%32 = icmp slt i32 %31, %3
br i1 %32, label %33, label %177
; <label>:33: ; preds = %30
%34 = srem i32 %.083105, %10
%35 = mul i32 %7, %34
%36 = add i32 %35, %15
%37 = mul nsw i32 %31, %2
%38 = add i32 %36, %37
br label %39
; <label>:39: ; preds = %33, %.preheader.preheader
%.086100 = phi i32 [ 0, %33 ], [ %40, %.preheader.preheader ]
%.09299 = phi float [ 0.000000e+00, %33 ], [ %155, %.preheader.preheader ]
%40 = add nuw nsw i32 %.086100, 16
%41 = or i32 %.086100, 15
%42 = mul i32 %41, %6
%43 = add i32 %42, %36
%44 = icmp slt i32 %43, %2
%45 = mul i32 %.086100, %6
br i1 %44, label %.preheader.preheader, label %157
.preheader.preheader: ; preds = %39
%46 = add i32 %38, %45
%47 = sext i32 %46 to i64
%48 = getelementptr inbounds float, float* %19, i64 %47
%49 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %48, i32 4) #8
%50 = fadd float %.09299, %49
%51 = or i32 %.086100, 1
%52 = mul i32 %51, %6
%53 = add i32 %38, %52
%54 = sext i32 %53 to i64
%55 = getelementptr inbounds float, float* %19, i64 %54
%56 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %55, i32 4) #8
%57 = fadd float %50, %56
%58 = or i32 %.086100, 2
%59 = mul i32 %58, %6
%60 = add i32 %38, %59
%61 = sext i32 %60 to i64
%62 = getelementptr inbounds float, float* %19, i64 %61
%63 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %62, i32 4) #8
%64 = fadd float %57, %63
%65 = or i32 %.086100, 3
%66 = mul i32 %65, %6
%67 = add i32 %38, %66
%68 = sext i32 %67 to i64
%69 = getelementptr inbounds float, float* %19, i64 %68
%70 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %69, i32 4) #8
%71 = fadd float %64, %70
%72 = or i32 %.086100, 4
%73 = mul i32 %72, %6
%74 = add i32 %38, %73
%75 = sext i32 %74 to i64
%76 = getelementptr inbounds float, float* %19, i64 %75
%77 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %76, i32 4) #8
%78 = fadd float %71, %77
%79 = or i32 %.086100, 5
%80 = mul i32 %79, %6
%81 = add i32 %38, %80
%82 = sext i32 %81 to i64
%83 = getelementptr inbounds float, float* %19, i64 %82
%84 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %83, i32 4) #8
%85 = fadd float %78, %84
%86 = or i32 %.086100, 6
%87 = mul i32 %86, %6
%88 = add i32 %38, %87
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %19, i64 %89
%91 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %90, i32 4) #8
%92 = fadd float %85, %91
%93 = or i32 %.086100, 7
%94 = mul i32 %93, %6
%95 = add i32 %38, %94
%96 = sext i32 %95 to i64
%97 = getelementptr inbounds float, float* %19, i64 %96
%98 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %97, i32 4) #8
%99 = fadd float %92, %98
%100 = or i32 %.086100, 8
%101 = mul i32 %100, %6
%102 = add i32 %38, %101
%103 = sext i32 %102 to i64
%104 = getelementptr inbounds float, float* %19, i64 %103
%105 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %104, i32 4) #8
%106 = fadd float %99, %105
%107 = or i32 %.086100, 9
%108 = mul i32 %107, %6
%109 = add i32 %38, %108
%110 = sext i32 %109 to i64
%111 = getelementptr inbounds float, float* %19, i64 %110
%112 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %111, i32 4) #8
%113 = fadd float %106, %112
%114 = or i32 %.086100, 10
%115 = mul i32 %114, %6
%116 = add i32 %38, %115
%117 = sext i32 %116 to i64
%118 = getelementptr inbounds float, float* %19, i64 %117
%119 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %118, i32 4) #8
%120 = fadd float %113, %119
%121 = or i32 %.086100, 11
%122 = mul i32 %121, %6
%123 = add i32 %38, %122
%124 = sext i32 %123 to i64
%125 = getelementptr inbounds float, float* %19, i64 %124
%126 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %125, i32 4) #8
%127 = fadd float %120, %126
%128 = or i32 %.086100, 12
%129 = mul i32 %128, %6
%130 = add i32 %38, %129
%131 = sext i32 %130 to i64
%132 = getelementptr inbounds float, float* %19, i64 %131
%133 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %132, i32 4) #8
%134 = fadd float %127, %133
%135 = or i32 %.086100, 13
%136 = mul i32 %135, %6
%137 = add i32 %38, %136
%138 = sext i32 %137 to i64
%139 = getelementptr inbounds float, float* %19, i64 %138
%140 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %139, i32 4) #8
%141 = fadd float %134, %140
%142 = or i32 %.086100, 14
%143 = mul i32 %142, %6
%144 = add i32 %38, %143
%145 = sext i32 %144 to i64
%146 = getelementptr inbounds float, float* %19, i64 %145
%147 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %146, i32 4) #8
%148 = fadd float %141, %147
%149 = or i32 %.086100, 15
%150 = mul i32 %149, %6
%151 = add i32 %38, %150
%152 = sext i32 %151 to i64
%153 = getelementptr inbounds float, float* %19, i64 %152
%154 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %153, i32 4) #8
%155 = fadd float %148, %154
%156 = icmp slt i32 %40, 128
br i1 %156, label %39, label %.critedge.loopexit125
; <label>:157: ; preds = %39
%.lcssa = phi i32 [ %45, %39 ]
%.09299.lcssa = phi float [ %.09299, %39 ]
%158 = add i32 %.lcssa, %36
%159 = icmp slt i32 %158, %2
br i1 %159, label %.lr.ph.preheader, label %.critedge
.lr.ph.preheader: ; preds = %157
br label %.lr.ph
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.084102 = phi i32 [ %165, %.lr.ph ], [ %158, %.lr.ph.preheader ]
%.1101 = phi float [ %164, %.lr.ph ], [ %.09299.lcssa, %.lr.ph.preheader ]
%160 = add nsw i32 %.084102, %37
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %19, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %.1101, %163
%165 = add i32 %.084102, %6
%166 = icmp slt i32 %165, %2
br i1 %166, label %.lr.ph, label %.critedge.loopexit
.critedge.loopexit: ; preds = %.lr.ph
%.lcssa134 = phi float [ %164, %.lr.ph ]
br label %.critedge
.critedge.loopexit125: ; preds = %.preheader.preheader
%.lcssa133 = phi float [ %155, %.preheader.preheader ]
br label %.critedge
.critedge: ; preds = %.critedge.loopexit125, %.critedge.loopexit, %157
%.3 = phi float [ %.09299.lcssa, %157 ], [ %.lcssa134, %.critedge.loopexit ], [ %.lcssa133, %.critedge.loopexit125 ]
tail call void @llvm.cuda.syncthreads()
br label %168
; <label>:167: ; preds = %168
%.lcssa135 = phi float [ %170, %168 ]
br i1 %21, label %173, label %177
; <label>:168: ; preds = %.critedge, %168
%.0104 = phi i32 [ 16, %.critedge ], [ %171, %168 ]
%.4103 = phi float [ %.3, %.critedge ], [ %170, %168 ]
%169 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.4103, i32 %.0104, i32 31) #3, !srcloc !53
%170 = fadd float %.4103, %169
%171 = lshr i32 %.0104, 1
%172 = icmp eq i32 %171, 0
br i1 %172, label %167, label %168
; <label>:173: ; preds = %167
%174 = sext i32 %31 to i64
%175 = getelementptr inbounds float, float* %4, i64 %174
%176 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %175, float %.lcssa135) #8
br label %177
; <label>:177: ; preds = %167, %173, %30
tail call void @llvm.cuda.syncthreads()
%178 = add i32 %.083105, %12
%179 = icmp slt i32 %178, %11
br i1 %179, label %30, label %._crit_edge.loopexit
}
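; OuterReductionKernel<NumPerThread = 16>: zero-fills the %3 output slots when
; gridDim.x == 1, then each thread serially __ldg-sums a strip of up to 16
; coefficients along the outer dimension and atomicAdds the partial result into
; its output element.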
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%8 = mul nuw nsw i32 %7, %6
%9 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%10 = mul nuw nsw i32 %9, %6
%11 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%12 = add nuw nsw i32 %10, %11
%13 = icmp eq i32 %7, 1
br i1 %13, label %.preheader, label %19
.preheader: ; preds = %5
%14 = icmp slt i32 %12, %3
br i1 %14, label %.lr.ph60.preheader, label %._crit_edge61
.lr.ph60.preheader: ; preds = %.preheader
br label %.lr.ph60
._crit_edge61.loopexit: ; preds = %.lr.ph60
br label %._crit_edge61
._crit_edge61: ; preds = %._crit_edge61.loopexit, %.preheader
tail call void @llvm.cuda.syncthreads()
br label %19
.lr.ph60: ; preds = %.lr.ph60.preheader, %.lr.ph60
%.059 = phi i32 [ %17, %.lr.ph60 ], [ %12, %.lr.ph60.preheader ]
%15 = sext i32 %.059 to i64
%16 = getelementptr inbounds float, float* %4, i64 %15
store float 0.000000e+00, float* %16, align 4
%17 = add nsw i32 %.059, %8
%18 = icmp slt i32 %17, %3
br i1 %18, label %.lr.ph60, label %._crit_edge61.loopexit
; <label>:19: ; preds = %._crit_edge61, %5
%20 = add i32 %2, 15
%21 = sdiv i32 %20, 16
%22 = mul nsw i32 %21, %3
%23 = icmp slt i32 %12, %22
br i1 %23, label %.lr.ph57, label %._crit_edge58
.lr.ph57: ; preds = %19
%24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0
%25 = load float*, float** %24, align 8
br label %26
._crit_edge58.loopexit: ; preds = %._crit_edge
br label %._crit_edge58
._crit_edge58: ; preds = %._crit_edge58.loopexit, %19
ret void
; <label>:26: ; preds = %.lr.ph57, %._crit_edge
%.04755 = phi i32 [ %12, %.lr.ph57 ], [ %36, %._crit_edge ]
%27 = srem i32 %.04755, %3
%28 = sdiv i32 %.04755, %3
%29 = shl nsw i32 %28, 4
%30 = add nsw i32 %29, 16
%31 = icmp sgt i32 %30, %2
%..i = select i1 %31, i32 %2, i32 %30
%32 = icmp slt i32 %29, %..i
br i1 %32, label %.lr.ph.preheader, label %._crit_edge
.lr.ph.preheader: ; preds = %26
br label %.lr.ph
._crit_edge.loopexit: ; preds = %.lr.ph
%.lcssa = phi float [ %43, %.lr.ph ]
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %26
%.052.lcssa = phi float [ 0.000000e+00, %26 ], [ %.lcssa, %._crit_edge.loopexit ]
%33 = sext i32 %27 to i64
%34 = getelementptr inbounds float, float* %4, i64 %33
%35 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %34, float %.052.lcssa) #8
%36 = add nsw i32 %.04755, %8
%37 = icmp slt i32 %36, %22
br i1 %37, label %26, label %._crit_edge58.loopexit
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.04654 = phi i32 [ %44, %.lr.ph ], [ %29, %.lr.ph.preheader ]
%.05253 = phi float [ %43, %.lr.ph ], [ 0.000000e+00, %.lr.ph.preheader ]
%38 = mul nsw i32 %.04654, %3
%39 = add nsw i32 %38, %27
%40 = sext i32 %39 to i64
%41 = getelementptr inbounds float, float* %25, i64 %40
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
%43 = fadd float %.05253, %42
%44 = add nsw i32 %.04654, 1
%45 = icmp slt i32 %44, %..i
br i1 %45, label %.lr.ph, label %._crit_edge.loopexit
}
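; EigenMetaKernel over TensorEvalToOp<TensorReductionOp<SumReducer, ...>>:
; grid-stride loop in which each thread evaluates one reduced output coefficient,
; summing along the reduction dimension via __ldg (unrolled x4 plus an epilogue
; loop) and storing the result to the eval-to buffer; an empty reduction extent
; stores 0.0 instead.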
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.5"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.444.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 7
%.sroa.444.0..sroa_cast = bitcast i32* %.sroa.444.0..sroa_idx to i64*
%.sroa.444.0.copyload = load i64, i64* %.sroa.444.0..sroa_cast, align 8
%.sroa.546.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 9, i32 0, i64 0
%.sroa.546.0.copyload = load i32, i32* %.sroa.546.0..sroa_idx, align 8
%.sroa.750.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 10, i32 0
%.sroa.750.0.copyload = load float*, float** %.sroa.750.0..sroa_idx, align 8
%.sroa.9.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 2
%.sroa.9.0.copyload = load float*, float** %.sroa.9.0..sroa_idx, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
.lr.ph.i: ; preds = %2
%11 = trunc i64 %.sroa.444.0.copyload to i32
%12 = icmp sgt i32 %.sroa.546.0.copyload, 0
%13 = lshr i64 %.sroa.444.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i
br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i
%15 = add i32 %.sroa.546.0.copyload, -1
%xtraiter = and i32 %.sroa.546.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.546.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i
br label %18
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa66 = phi i32 [ %47, %18 ]
%.lcssa65 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; <label>:48: ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !56
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa67 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit, %2
ret void
}
; Function Attrs: nounwind
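; InitVector<PtrWrapper<float, int>>: fills the first %1 elements of the wrapped
; buffer with the constant %0 via a grid-stride loop.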
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_(float, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = add nuw nsw i32 %6, %7
%9 = icmp slt i32 %8, %1
br i1 %9, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %3
%10 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %2, i64 0, i32 0
%11 = load float*, float** %10, align 8
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %5
br label %14
._crit_edge.loopexit: ; preds = %14
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %3
ret void
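; block %14: grid-stride store body; the stride %13 is gridDim.x * blockDim.x.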
; <label>:14: ; preds = %.lr.ph, %14
%.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ]
%15 = sext i32 %.08 to i64
%16 = getelementptr inbounds float, float* %11, i64 %15
store float %0, float* %16, align 4
%17 = add i32 %13, %.08
%18 = icmp slt i32 %17, %1
br i1 %18, label %14, label %._crit_edge.loopexit
}
; Function Attrs: convergent nounwind
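; RowReduceKernel<32, 256, 128, ...> with CudaSumReducer: after the launch-shape
; asserts, each 256-thread block sums one 32768-column chunk of a row (128 strided
; ldg loads per thread), reduces each warp with shfl.down, and lane 0 of every warp
; atomically adds its partial sum to the output element for that row.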
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%38 = load float*, float** %37, align 8
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
br label %41
._crit_edge.loopexit: ; preds = %187
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:41: ; preds = %.lr.ph, %187
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ]
%42 = srem i32 %.0114, %31
%43 = sdiv i32 %.0114, %31
%44 = shl nsw i32 %42, 15
%45 = or i32 %44, %34
%46 = icmp slt i32 %43, %2
br i1 %46, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %164, %.preheader.preheader ]
br label %.thread.preheader
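; .thread.preheader: merges the per-thread partial sum from whichever bounds check
; failed first, then falls through to the warp-level reduction.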
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %41
%47 = mul nsw i32 %43, %3
%48 = add i32 %47, %45
br label %49
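; block %49: outer accumulation loop; up to 8 trips of the 16-way unrolled body below
; (128 elements per thread), exiting early to the bounds-checked tail in .preheader101.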
; <label>:49: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ]
%50 = add nuw nsw i32 %.098108, 16
%51 = shl i32 %.098108, 8
%52 = or i32 %51, 3840
%53 = add nsw i32 %52, %45
%54 = icmp slt i32 %53, %3
br i1 %54, label %.preheader.preheader, label %.preheader101
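; .preheader.preheader: fully unrolled step of 16 ldg loads at stride 256, each
; accumulated with fadd; the guard above checked the largest offset, so all 16 are in bounds.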
.preheader.preheader: ; preds = %49
%55 = add i32 %48, %51
%56 = sext i32 %55 to i64
%57 = getelementptr inbounds float, float* %40, i64 %56
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8
%59 = fadd float %.095109, %58
%60 = shl i32 %.098108, 8
%61 = or i32 %60, 256
%62 = add i32 %48, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, float* %40, i64 %63
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8
%66 = fadd float %59, %65
%67 = shl i32 %.098108, 8
%68 = or i32 %67, 512
%69 = add i32 %48, %68
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %40, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %66, %72
%74 = shl i32 %.098108, 8
%75 = or i32 %74, 768
%76 = add i32 %48, %75
%77 = sext i32 %76 to i64
%78 = getelementptr inbounds float, float* %40, i64 %77
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8
%80 = fadd float %73, %79
%81 = shl i32 %.098108, 8
%82 = or i32 %81, 1024
%83 = add i32 %48, %82
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %40, i64 %84
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8
%87 = fadd float %80, %86
%88 = shl i32 %.098108, 8
%89 = or i32 %88, 1280
%90 = add i32 %48, %89
%91 = sext i32 %90 to i64
%92 = getelementptr inbounds float, float* %40, i64 %91
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8
%94 = fadd float %87, %93
%95 = shl i32 %.098108, 8
%96 = or i32 %95, 1536
%97 = add i32 %48, %96
%98 = sext i32 %97 to i64
%99 = getelementptr inbounds float, float* %40, i64 %98
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8
%101 = fadd float %94, %100
%102 = shl i32 %.098108, 8
%103 = or i32 %102, 1792
%104 = add i32 %48, %103
%105 = sext i32 %104 to i64
%106 = getelementptr inbounds float, float* %40, i64 %105
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8
%108 = fadd float %101, %107
%109 = shl i32 %.098108, 8
%110 = or i32 %109, 2048
%111 = add i32 %48, %110
%112 = sext i32 %111 to i64
%113 = getelementptr inbounds float, float* %40, i64 %112
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8
%115 = fadd float %108, %114
%116 = shl i32 %.098108, 8
%117 = or i32 %116, 2304
%118 = add i32 %48, %117
%119 = sext i32 %118 to i64
%120 = getelementptr inbounds float, float* %40, i64 %119
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8
%122 = fadd float %115, %121
%123 = shl i32 %.098108, 8
%124 = or i32 %123, 2560
%125 = add i32 %48, %124
%126 = sext i32 %125 to i64
%127 = getelementptr inbounds float, float* %40, i64 %126
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8
%129 = fadd float %122, %128
%130 = shl i32 %.098108, 8
%131 = or i32 %130, 2816
%132 = add i32 %48, %131
%133 = sext i32 %132 to i64
%134 = getelementptr inbounds float, float* %40, i64 %133
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8
%136 = fadd float %129, %135
%137 = shl i32 %.098108, 8
%138 = or i32 %137, 3072
%139 = add i32 %48, %138
%140 = sext i32 %139 to i64
%141 = getelementptr inbounds float, float* %40, i64 %140
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8
%143 = fadd float %136, %142
%144 = shl i32 %.098108, 8
%145 = or i32 %144, 3328
%146 = add i32 %48, %145
%147 = sext i32 %146 to i64
%148 = getelementptr inbounds float, float* %40, i64 %147
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8
%150 = fadd float %143, %149
%151 = shl i32 %.098108, 8
%152 = or i32 %151, 3584
%153 = add i32 %48, %152
%154 = sext i32 %153 to i64
%155 = getelementptr inbounds float, float* %40, i64 %154
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8
%157 = fadd float %150, %156
%158 = shl i32 %.098108, 8
%159 = or i32 %158, 3840
%160 = add i32 %48, %159
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %40, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %157, %163
%165 = icmp slt i32 %50, 128
br i1 %165, label %49, label %.thread.preheader.loopexit
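; .preheader101: tail of the unrolled loop; the remaining strided loads are each
; individually bounds-checked against %3 before being accumulated.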
.preheader101: ; preds = %49
%.lcssa = phi i32 [ %51, %49 ]
%.098108.lcssa = phi i32 [ %.098108, %49 ]
%.095109.lcssa = phi float [ %.095109, %49 ]
%166 = add nsw i32 %.lcssa, %45
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %47
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %40, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = fadd float %.095109.lcssa, %172
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %45
%177 = icmp slt i32 %176, %3
br i1 %177, label %190, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %46, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %187, label %183
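; .thread: warp-level reduction; shfl.down.b32 with offsets 16, 8, 4, 2, 1 folds the
; 32 lane partials into lane 0.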
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = fadd float %.8112, %179
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !58
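; block %183: reached only by lane 0 of a warp handling an in-range row (%brmerge
; otherwise skips ahead); atomically adds the warp total to the output element.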
; <label>:183: ; preds = %178
%184 = sext i32 %43 to i64
%185 = getelementptr inbounds float, float* %38, i64 %184
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8
br label %187
; <label>:187: ; preds = %178, %183
%188 = add nuw nsw i32 %.0114, 32
%189 = icmp slt i32 %188, %32
br i1 %189, label %41, label %._crit_edge.loopexit
; <label>:190: ; preds = %168
%191 = add nsw i32 %176, %47
%192 = sext i32 %191 to i64
%193 = getelementptr inbounds float, float* %40, i64 %192
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8
%195 = fadd float %173, %194
%196 = shl i32 %.098108.lcssa, 8
%197 = or i32 %196, 512
%198 = add nsw i32 %197, %45
%199 = icmp slt i32 %198, %3
br i1 %199, label %200, label %.thread.preheader
; <label>:200: ; preds = %190
%201 = add nsw i32 %198, %47
%202 = sext i32 %201 to i64
%203 = getelementptr inbounds float, float* %40, i64 %202
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8
%205 = fadd float %195, %204
%206 = shl i32 %.098108.lcssa, 8
%207 = or i32 %206, 768
%208 = add nsw i32 %207, %45
%209 = icmp slt i32 %208, %3
br i1 %209, label %210, label %.thread.preheader
; <label>:210: ; preds = %200
%211 = add nsw i32 %208, %47
%212 = sext i32 %211 to i64
%213 = getelementptr inbounds float, float* %40, i64 %212
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8
%215 = fadd float %205, %214
%216 = shl i32 %.098108.lcssa, 8
%217 = or i32 %216, 1024
%218 = add nsw i32 %217, %45
%219 = icmp slt i32 %218, %3
br i1 %219, label %220, label %.thread.preheader
; <label>:220: ; preds = %210
%221 = add nsw i32 %218, %47
%222 = sext i32 %221 to i64
%223 = getelementptr inbounds float, float* %40, i64 %222
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8
%225 = fadd float %215, %224
%226 = shl i32 %.098108.lcssa, 8
%227 = or i32 %226, 1280
%228 = add nsw i32 %227, %45
%229 = icmp slt i32 %228, %3
br i1 %229, label %230, label %.thread.preheader
; <label>:230: ; preds = %220
%231 = add nsw i32 %228, %47
%232 = sext i32 %231 to i64
%233 = getelementptr inbounds float, float* %40, i64 %232
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8
%235 = fadd float %225, %234
%236 = shl i32 %.098108.lcssa, 8
%237 = or i32 %236, 1536
%238 = add nsw i32 %237, %45
%239 = icmp slt i32 %238, %3
br i1 %239, label %240, label %.thread.preheader
; <label>:240: ; preds = %230
%241 = add nsw i32 %238, %47
%242 = sext i32 %241 to i64
%243 = getelementptr inbounds float, float* %40, i64 %242
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8
%245 = fadd float %235, %244
%246 = shl i32 %.098108.lcssa, 8
%247 = or i32 %246, 1792
%248 = add nsw i32 %247, %45
%249 = icmp slt i32 %248, %3
br i1 %249, label %250, label %.thread.preheader
; <label>:250: ; preds = %240
%251 = add nsw i32 %248, %47
%252 = sext i32 %251 to i64
%253 = getelementptr inbounds float, float* %40, i64 %252
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8
%255 = fadd float %245, %254
%256 = shl i32 %.098108.lcssa, 8
%257 = or i32 %256, 2048
%258 = add nsw i32 %257, %45
%259 = icmp slt i32 %258, %3
br i1 %259, label %260, label %.thread.preheader
; <label>:260: ; preds = %250
%261 = add nsw i32 %258, %47
%262 = sext i32 %261 to i64
%263 = getelementptr inbounds float, float* %40, i64 %262
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8
%265 = fadd float %255, %264
%266 = shl i32 %.098108.lcssa, 8
%267 = or i32 %266, 2304
%268 = add nsw i32 %267, %45
%269 = icmp slt i32 %268, %3
br i1 %269, label %270, label %.thread.preheader
; <label>:270: ; preds = %260
%271 = add nsw i32 %268, %47
%272 = sext i32 %271 to i64
%273 = getelementptr inbounds float, float* %40, i64 %272
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8
%275 = fadd float %265, %274
%276 = shl i32 %.098108.lcssa, 8
%277 = or i32 %276, 2560
%278 = add nsw i32 %277, %45
%279 = icmp slt i32 %278, %3
br i1 %279, label %280, label %.thread.preheader
; <label>:280: ; preds = %270
%281 = add nsw i32 %278, %47
%282 = sext i32 %281 to i64
%283 = getelementptr inbounds float, float* %40, i64 %282
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8
%285 = fadd float %275, %284
%286 = shl i32 %.098108.lcssa, 8
%287 = or i32 %286, 2816
%288 = add nsw i32 %287, %45
%289 = icmp slt i32 %288, %3
br i1 %289, label %290, label %.thread.preheader
; <label>:290: ; preds = %280
%291 = add nsw i32 %288, %47
%292 = sext i32 %291 to i64
%293 = getelementptr inbounds float, float* %40, i64 %292
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8
%295 = fadd float %285, %294
%296 = shl i32 %.098108.lcssa, 8
%297 = or i32 %296, 3072
%298 = add nsw i32 %297, %45
%299 = icmp slt i32 %298, %3
br i1 %299, label %300, label %.thread.preheader
; <label>:300: ; preds = %290
%301 = add nsw i32 %298, %47
%302 = sext i32 %301 to i64
%303 = getelementptr inbounds float, float* %40, i64 %302
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8
%305 = fadd float %295, %304
%306 = shl i32 %.098108.lcssa, 8
%307 = or i32 %306, 3328
%308 = add nsw i32 %307, %45
%309 = icmp slt i32 %308, %3
br i1 %309, label %310, label %.thread.preheader
; <label>:310: ; preds = %300
%311 = add nsw i32 %308, %47
%312 = sext i32 %311 to i64
%313 = getelementptr inbounds float, float* %40, i64 %312
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8
%315 = fadd float %305, %314
%316 = shl i32 %.098108.lcssa, 8
%317 = or i32 %316, 3584
%318 = add nsw i32 %317, %45
%319 = icmp slt i32 %318, %3
br i1 %319, label %320, label %.thread.preheader
; <label>:320: ; preds = %310
%321 = add nsw i32 %318, %47
%322 = sext i32 %321 to i64
%323 = getelementptr inbounds float, float* %40, i64 %322
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8
%325 = fadd float %315, %324
br label %.thread.preheader
}
; Function Attrs: convergent inlinehint noreturn nounwind
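; Device-side assert helper: forwards the failing expression, line number, and
; function name to CUDA's __assertfail; never returns.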
define internal fastcc void @_ZL13__assert_failPKcS0_jS0_(i8*, i32, i8*) unnamed_addr #6 {
tail call void @__assertfail(i8* %0, i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i64 0, i64 0), i32 %1, i8* %2, i64 1) #10
unreachable
}
; Function Attrs: convergent noreturn
declare void @__assertfail(i8*, i8*, i32, i8*, i64) #7
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.y() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.z() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.y() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.z() #1
; Function Attrs: convergent nounwind
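; RowReduceKernel<32, 256, 128, ...> with CudaMaxReducer: same shape as the sum
; variant above, but the accumulator is seeded from the reducer's stored value,
; elements combine through llvm.nvvm.fmax.f, and the final write uses a cmpxchg
; loop instead of an atomic add.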
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
br label %39
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ]
%40 = srem i32 %.0114, %31
%41 = sdiv i32 %.0114, %31
%42 = shl nsw i32 %40, 15
%43 = or i32 %42, %34
%.idx.val = load float, float* %.idx, align 4
%44 = icmp slt i32 %41, %2
br i1 %44, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %163, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %39
%45 = mul nsw i32 %41, %3
%46 = add i32 %45, %43
%47 = load float*, float** %38, align 8
br label %48
; <label>:48: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ]
%49 = add nuw nsw i32 %.098108, 16
%50 = shl i32 %.098108, 8
%51 = or i32 %50, 3840
%52 = add nsw i32 %51, %43
%53 = icmp slt i32 %52, %3
br i1 %53, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %48
%54 = add i32 %46, %50
%55 = sext i32 %54 to i64
%56 = getelementptr inbounds float, float* %47, i64 %55
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8
%59 = shl i32 %.098108, 8
%60 = or i32 %59, 256
%61 = add i32 %46, %60
%62 = sext i32 %61 to i64
%63 = getelementptr inbounds float, float* %47, i64 %62
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8
%66 = shl i32 %.098108, 8
%67 = or i32 %66, 512
%68 = add i32 %46, %67
%69 = sext i32 %68 to i64
%70 = getelementptr inbounds float, float* %47, i64 %69
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8
%73 = shl i32 %.098108, 8
%74 = or i32 %73, 768
%75 = add i32 %46, %74
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %47, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8
%80 = shl i32 %.098108, 8
%81 = or i32 %80, 1024
%82 = add i32 %46, %81
%83 = sext i32 %82 to i64
%84 = getelementptr inbounds float, float* %47, i64 %83
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8
%87 = shl i32 %.098108, 8
%88 = or i32 %87, 1280
%89 = add i32 %46, %88
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds float, float* %47, i64 %90
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8
%94 = shl i32 %.098108, 8
%95 = or i32 %94, 1536
%96 = add i32 %46, %95
%97 = sext i32 %96 to i64
%98 = getelementptr inbounds float, float* %47, i64 %97
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8
%101 = shl i32 %.098108, 8
%102 = or i32 %101, 1792
%103 = add i32 %46, %102
%104 = sext i32 %103 to i64
%105 = getelementptr inbounds float, float* %47, i64 %104
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8
%108 = shl i32 %.098108, 8
%109 = or i32 %108, 2048
%110 = add i32 %46, %109
%111 = sext i32 %110 to i64
%112 = getelementptr inbounds float, float* %47, i64 %111
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8
%115 = shl i32 %.098108, 8
%116 = or i32 %115, 2304
%117 = add i32 %46, %116
%118 = sext i32 %117 to i64
%119 = getelementptr inbounds float, float* %47, i64 %118
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8
%122 = shl i32 %.098108, 8
%123 = or i32 %122, 2560
%124 = add i32 %46, %123
%125 = sext i32 %124 to i64
%126 = getelementptr inbounds float, float* %47, i64 %125
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8
%129 = shl i32 %.098108, 8
%130 = or i32 %129, 2816
%131 = add i32 %46, %130
%132 = sext i32 %131 to i64
%133 = getelementptr inbounds float, float* %47, i64 %132
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8
%136 = shl i32 %.098108, 8
%137 = or i32 %136, 3072
%138 = add i32 %46, %137
%139 = sext i32 %138 to i64
%140 = getelementptr inbounds float, float* %47, i64 %139
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8
%143 = shl i32 %.098108, 8
%144 = or i32 %143, 3328
%145 = add i32 %46, %144
%146 = sext i32 %145 to i64
%147 = getelementptr inbounds float, float* %47, i64 %146
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8
%150 = shl i32 %.098108, 8
%151 = or i32 %150, 3584
%152 = add i32 %46, %151
%153 = sext i32 %152 to i64
%154 = getelementptr inbounds float, float* %47, i64 %153
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8
%157 = shl i32 %.098108, 8
%158 = or i32 %157, 3840
%159 = add i32 %46, %158
%160 = sext i32 %159 to i64
%161 = getelementptr inbounds float, float* %47, i64 %160
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8
%164 = icmp slt i32 %49, 128
br i1 %164, label %48, label %.thread.preheader.loopexit
.preheader101: ; preds = %48
%.lcssa = phi i32 [ %50, %48 ]
%.098108.lcssa = phi i32 [ %.098108, %48 ]
%.095109.lcssa = phi float [ %.095109, %48 ]
%165 = load float*, float** %38, align 8
%166 = add nsw i32 %.lcssa, %43
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %45
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %165, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %43
%177 = icmp slt i32 %176, %3
br i1 %177, label %198, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %44, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !59
; <label>:183: ; preds = %178
%184 = load float*, float** %37, align 8
%185 = sext i32 %41 to i64
%186 = getelementptr inbounds float, float* %184, i64 %185
%187 = bitcast float %.lcssa138 to i32
%188 = bitcast float* %186 to i32*
%189 = load i32, i32* %188, align 4
br label %190
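; blocks %190/%193: atomic float max emulated with an i32 cmpxchg; retry until the
; stored value is no longer smaller than the warp maximum.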
; <label>:190: ; preds = %193, %183
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ]
%191 = bitcast i32 %.011.i to float
%192 = fcmp olt float %191, %.lcssa138
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit
; <label>:193: ; preds = %190
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst
%195 = extractvalue { i32, i1 } %194, 0
%not..i = icmp eq i32 %.011.i, %195
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178
%196 = add nuw nsw i32 %.0114, 32
%197 = icmp slt i32 %196, %32
br i1 %197, label %39, label %._crit_edge.loopexit
; <label>:198: ; preds = %168
%199 = add nsw i32 %176, %45
%200 = sext i32 %199 to i64
%201 = getelementptr inbounds float, float* %165, i64 %200
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8
%204 = shl i32 %.098108.lcssa, 8
%205 = or i32 %204, 512
%206 = add nsw i32 %205, %43
%207 = icmp slt i32 %206, %3
br i1 %207, label %208, label %.thread.preheader
; <label>:208: ; preds = %198
%209 = add nsw i32 %206, %45
%210 = sext i32 %209 to i64
%211 = getelementptr inbounds float, float* %165, i64 %210
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8
%214 = shl i32 %.098108.lcssa, 8
%215 = or i32 %214, 768
%216 = add nsw i32 %215, %43
%217 = icmp slt i32 %216, %3
br i1 %217, label %218, label %.thread.preheader
; <label>:218: ; preds = %208
%219 = add nsw i32 %216, %45
%220 = sext i32 %219 to i64
%221 = getelementptr inbounds float, float* %165, i64 %220
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8
%224 = shl i32 %.098108.lcssa, 8
%225 = or i32 %224, 1024
%226 = add nsw i32 %225, %43
%227 = icmp slt i32 %226, %3
br i1 %227, label %228, label %.thread.preheader
; <label>:228: ; preds = %218
%229 = add nsw i32 %226, %45
%230 = sext i32 %229 to i64
%231 = getelementptr inbounds float, float* %165, i64 %230
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8
%234 = shl i32 %.098108.lcssa, 8
%235 = or i32 %234, 1280
%236 = add nsw i32 %235, %43
%237 = icmp slt i32 %236, %3
br i1 %237, label %238, label %.thread.preheader
; <label>:238: ; preds = %228
%239 = add nsw i32 %236, %45
%240 = sext i32 %239 to i64
%241 = getelementptr inbounds float, float* %165, i64 %240
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8
%244 = shl i32 %.098108.lcssa, 8
%245 = or i32 %244, 1536
%246 = add nsw i32 %245, %43
%247 = icmp slt i32 %246, %3
br i1 %247, label %248, label %.thread.preheader
; <label>:248: ; preds = %238
%249 = add nsw i32 %246, %45
%250 = sext i32 %249 to i64
%251 = getelementptr inbounds float, float* %165, i64 %250
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8
%254 = shl i32 %.098108.lcssa, 8
%255 = or i32 %254, 1792
%256 = add nsw i32 %255, %43
%257 = icmp slt i32 %256, %3
br i1 %257, label %258, label %.thread.preheader
; <label>:258: ; preds = %248
%259 = add nsw i32 %256, %45
%260 = sext i32 %259 to i64
%261 = getelementptr inbounds float, float* %165, i64 %260
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8
%264 = shl i32 %.098108.lcssa, 8
%265 = or i32 %264, 2048
%266 = add nsw i32 %265, %43
%267 = icmp slt i32 %266, %3
br i1 %267, label %268, label %.thread.preheader
; <label>:268: ; preds = %258
%269 = add nsw i32 %266, %45
%270 = sext i32 %269 to i64
%271 = getelementptr inbounds float, float* %165, i64 %270
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8
%274 = shl i32 %.098108.lcssa, 8
%275 = or i32 %274, 2304
%276 = add nsw i32 %275, %43
%277 = icmp slt i32 %276, %3
br i1 %277, label %278, label %.thread.preheader
; <label>:278: ; preds = %268
%279 = add nsw i32 %276, %45
%280 = sext i32 %279 to i64
%281 = getelementptr inbounds float, float* %165, i64 %280
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8
%284 = shl i32 %.098108.lcssa, 8
%285 = or i32 %284, 2560
%286 = add nsw i32 %285, %43
%287 = icmp slt i32 %286, %3
br i1 %287, label %288, label %.thread.preheader
; <label>:288: ; preds = %278
%289 = add nsw i32 %286, %45
%290 = sext i32 %289 to i64
%291 = getelementptr inbounds float, float* %165, i64 %290
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8
%294 = shl i32 %.098108.lcssa, 8
%295 = or i32 %294, 2816
%296 = add nsw i32 %295, %43
%297 = icmp slt i32 %296, %3
br i1 %297, label %298, label %.thread.preheader
; <label>:298: ; preds = %288
%299 = add nsw i32 %296, %45
%300 = sext i32 %299 to i64
%301 = getelementptr inbounds float, float* %165, i64 %300
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8
%304 = shl i32 %.098108.lcssa, 8
%305 = or i32 %304, 3072
%306 = add nsw i32 %305, %43
%307 = icmp slt i32 %306, %3
br i1 %307, label %308, label %.thread.preheader
; <label>:308: ; preds = %298
%309 = add nsw i32 %306, %45
%310 = sext i32 %309 to i64
%311 = getelementptr inbounds float, float* %165, i64 %310
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8
%314 = shl i32 %.098108.lcssa, 8
%315 = or i32 %314, 3328
%316 = add nsw i32 %315, %43
%317 = icmp slt i32 %316, %3
br i1 %317, label %318, label %.thread.preheader
; <label>:318: ; preds = %308
%319 = add nsw i32 %316, %45
%320 = sext i32 %319 to i64
%321 = getelementptr inbounds float, float* %165, i64 %320
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8
%324 = shl i32 %.098108.lcssa, 8
%325 = or i32 %324, 3584
%326 = add nsw i32 %325, %43
%327 = icmp slt i32 %326, %3
br i1 %327, label %328, label %.thread.preheader
; <label>:328: ; preds = %318
%329 = add nsw i32 %326, %45
%330 = sext i32 %329 to i64
%331 = getelementptr inbounds float, float* %165, i64 %330
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8
br label %.thread.preheader
}
; Function Attrs: convergent nounwind
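; ColumnReduceKernel<128, 256, 16, ...> with CudaSumReducer: each thread owns a
; (column, 16-row tile) pair, ldg-loads up to 16 column entries (unrolled by 2),
; and atomically adds its tile sum to the per-column output.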
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
%41 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%42 = load float*, float** %41, align 8
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
%43 = add i32 %32, -1
%44 = sub i32 %43, %34
%45 = sub i32 %44, %35
%46 = lshr i32 %45, 15
%47 = add nuw nsw i32 %46, 1
%xtraiter = and i32 %47, 3
%48 = icmp ult i32 %45, 98304
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader
%unroll_iter = sub nsw i32 %47, %xtraiter
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ]
%49 = srem i32 %.047.us, %3
%50 = sdiv i32 %.047.us, %3
%51 = srem i32 %50, %31
%52 = shl nsw i32 %51, 4
br label %53
; <label>:53: ; preds = %104, %.lr.ph.split.us
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ]
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ]
%54 = add nuw nsw i32 %.04346.us.us, %52
%55 = icmp slt i32 %54, %2
br i1 %55, label %56, label %62
; <label>:56: ; preds = %53
%57 = mul nsw i32 %54, %3
%58 = add nsw i32 %57, %49
%59 = sext i32 %58 to i64
%60 = getelementptr inbounds float, float* %40, i64 %59
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8
br label %62
; <label>:62: ; preds = %56, %53
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ]
%64 = fadd float %.04445.us.us, %63
%65 = or i32 %.04346.us.us, 1
%66 = add nuw nsw i32 %65, %52
%67 = icmp slt i32 %66, %2
br i1 %67, label %98, label %104
.us-lcssa.us.us: ; preds = %104
%.lcssa = phi float [ %106, %104 ]
%68 = sext i32 %49 to i64
%69 = getelementptr inbounds float, float* %42, i64 %68
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8
%71 = add nuw nsw i32 %.047.us, 32768
%72 = icmp slt i32 %71, %32
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us
br label %._crit_edge
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ]
br label %._crit_edge.loopexit59.unr-lcssa
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa
br label %.lr.ph.split.epil
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ]
%73 = srem i32 %.047.epil, %3
%74 = sext i32 %73 to i64
%75 = getelementptr inbounds float, float* %42, i64 %74
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8
%77 = add nuw nsw i32 %.047.epil, 32768
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !60
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil
br label %._crit_edge.loopexit59
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29
ret void
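; .lr.ph.split (and its epilogue): path taken when %3 (the column extent) is
; negative, presumably unreachable in practice; the accumulation has folded away,
; leaving only 4x-unrolled atomic adds of 0.0.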
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ]
%78 = srem i32 %.047, %3
%79 = sext i32 %78 to i64
%80 = getelementptr inbounds float, float* %42, i64 %79
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8
%82 = add nuw nsw i32 %.047, 32768
%83 = srem i32 %82, %3
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %42, i64 %84
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8
%87 = add nsw i32 %.047, 65536
%88 = srem i32 %87, %3
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %42, i64 %89
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8
%92 = add nsw i32 %.047, 98304
%93 = srem i32 %92, %3
%94 = sext i32 %93 to i64
%95 = getelementptr inbounds float, float* %42, i64 %94
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8
%97 = add nsw i32 %.047, 131072
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split
; <label>:98: ; preds = %62
%99 = mul nsw i32 %66, %3
%100 = add nsw i32 %99, %49
%101 = sext i32 %100 to i64
%102 = getelementptr inbounds float, float* %40, i64 %101
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8
br label %104
; <label>:104: ; preds = %98, %62
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ]
%106 = fadd float %64, %105
%107 = add nsw i32 %.04346.us.us, 2
%exitcond.1 = icmp eq i32 %107, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53
}
; Function Attrs: convergent nounwind
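; ColumnReduceKernel<128, 256, 16, ...> with CudaMaxReducer: as above, but seeded
; from the reducer's stored value, combined with llvm.nvvm.fmax.f, and finished
; with the cmpxchg-based atomic float max.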
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
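; Loop preheader: hoists the reducer's stored initial value (%.idx45) and the
; input/output base-pointer slots, then unswitches the outer loop on %3 >= 0
; (the ".us" clone below is the version that actually loads input data).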
.lr.ph: ; preds = %29
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ]
%41 = srem i32 %.048.us, %3
%42 = sdiv i32 %.048.us, %3
%43 = srem i32 %42, %31
%44 = shl nsw i32 %43, 4
%.idx45.val.us = load float, float* %.idx45, align 4
%45 = load float*, float** %39, align 8
br label %54
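; Blocks 46/49 are the inlined CudaMaxReducer::atomic_reduce: the float output
; word is reinterpreted as i32 and a cmpxchg loop retries until either the CAS
; succeeds or the stored value is no longer below the candidate maximum.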
; <label>:46: ; preds = %49, %.us-lcssa.us.us
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ]
%47 = bitcast i32 %.011.i.us to float
%48 = fcmp olt float %47, %.lcssa
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
; <label>:49: ; preds = %46
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst
%51 = extractvalue { i32, i1 } %50, 0
%not..i.us = icmp eq i32 %.011.i.us, %51
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46
%52 = add nuw nsw i32 %.048.us, 32768
%53 = icmp slt i32 %52, %32
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit
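; Inner tile loop (block 54): walks the 16 rows of this tile two at a time
; (unrolled x2); in-range elements are loaded through ld.global.nc
; (llvm.nvvm.ldg) and folded with fmax, while out-of-range lanes substitute
; the reducer's initial value.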
; <label>:54: ; preds = %112, %.lr.ph.split.us
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ]
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ]
%55 = add nuw nsw i32 %.04347.us.us, %44
%56 = icmp slt i32 %55, %2
br i1 %56, label %57, label %63
; <label>:57: ; preds = %54
%58 = mul nsw i32 %55, %3
%59 = add nsw i32 %58, %41
%60 = sext i32 %59 to i64
%61 = getelementptr inbounds float, float* %45, i64 %60
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8
br label %63
; <label>:63: ; preds = %54, %57
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ]
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8
%66 = or i32 %.04347.us.us, 1
%67 = add nuw nsw i32 %66, %44
%68 = icmp slt i32 %67, %2
br i1 %68, label %106, label %112
.us-lcssa.us.us: ; preds = %112
%.lcssa = phi float [ %114, %112 ]
%69 = load float*, float** %40, align 8
%70 = sext i32 %41 to i64
%71 = getelementptr inbounds float, float* %69, i64 %70
%72 = bitcast float %.lcssa to i32
%73 = bitcast float* %71 to i32*
%74 = load i32, i32* %73, align 4
br label %46
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
br label %._crit_edge
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29
ret void
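; Degenerate clone of the outer loop (the %3 < 0 side of the unswitch): all
; guarded input loads were eliminated, so the 16 chained fmax calls merely
; combine the initial value with itself before it is atomically merged into
; the output.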
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ]
%.idx45.val = load float, float* %.idx45, align 4
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8
%91 = srem i32 %.048, %3
%92 = load float*, float** %40, align 8
%93 = sext i32 %91 to i64
%94 = getelementptr inbounds float, float* %92, i64 %93
%95 = bitcast float %90 to i32
%96 = bitcast float* %94 to i32*
%97 = load i32, i32* %96, align 4
br label %98
; <label>:98: ; preds = %101, %.lr.ph.split
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ]
%99 = bitcast i32 %.011.i to float
%100 = fcmp olt float %99, %90
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
; <label>:101: ; preds = %98
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst
%103 = extractvalue { i32, i1 } %102, 0
%not..i = icmp eq i32 %.011.i, %103
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101
%104 = add nuw nsw i32 %.048, 32768
%105 = icmp slt i32 %104, %32
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60
; <label>:106: ; preds = %63
%107 = mul nsw i32 %67, %3
%108 = add nsw i32 %107, %41
%109 = sext i32 %108 to i64
%110 = getelementptr inbounds float, float* %45, i64 %109
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8
br label %112
; <label>:112: ; preds = %106, %63
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ]
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8
%115 = add nsw i32 %.04347.us.us, 2
%exitcond.1 = icmp eq i32 %115, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54
}
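
; EigenMetaKernel instantiated for assigning a TensorForcedEvalOp result: the
; reduction has already been materialized by the forced evaluation, so this is
; a plain grid-stride copy kernel. Note it moves each 4-byte element as an i32
; (bitcast load/store) rather than as a float.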
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.6"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.021.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %0, i64 0, i32 0, i32 0
%.sroa.021.0.copyload = load float*, float** %.sroa.021.0..sroa_idx, align 8
%.sroa.5.0..sroa_idx25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %0, i64 0, i32 1, i32 3
%.sroa.5.0.copyload = load float*, float** %.sroa.5.0..sroa_idx25, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i.preheader, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
.lr.ph.i.preheader: ; preds = %2
br label %.lr.ph.i
.lr.ph.i: ; preds = %.lr.ph.i.preheader, %.lr.ph.i
%.07.i = phi i32 [ %17, %.lr.ph.i ], [ %7, %.lr.ph.i.preheader ]
%11 = sext i32 %.07.i to i64
%12 = getelementptr inbounds float, float* %.sroa.5.0.copyload, i64 %11
%13 = bitcast float* %12 to i32*
%14 = load i32, i32* %13, align 4
%15 = getelementptr inbounds float, float* %.sroa.021.0.copyload, i64 %11
%16 = bitcast float* %15 to i32*
store i32 %14, i32* %16, align 4
%17 = add nsw i32 %.07.i, %9
%18 = icmp slt i32 %17, %1
br i1 %18, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit: ; preds = %.lr.ph.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit, %2
ret void
}
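
; EigenMetaKernel for the assign-of-reduction expression evaluated in-line:
; each covered output index computes its own sum over the reduced dimension.
; The evaluator fields pulled out below look like the input pointer
; (%.sroa.8), the reduced extent (%.sroa.648), and two 32-bit strides packed
; into one i64 (%.sroa.545: low word scales the output index, high word steps
; along the reduced dimension) -- an interpretation read off the address math.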
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.11"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.041.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 0, i32 0
%.sroa.041.0.copyload = load float*, float** %.sroa.041.0..sroa_idx, align 8
%.sroa.545.0..sroa_idx46 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 7
%.sroa.545.0..sroa_cast = bitcast i32* %.sroa.545.0..sroa_idx46 to i64*
%.sroa.545.0.copyload = load i64, i64* %.sroa.545.0..sroa_cast, align 8
%.sroa.648.0..sroa_idx49 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 9, i32 0, i64 0
%.sroa.648.0.copyload = load i32, i32* %.sroa.648.0..sroa_idx49, align 8
%.sroa.8.0..sroa_idx53 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 10, i32 0
%.sroa.8.0.copyload = load float*, float** %.sroa.8.0..sroa_idx53, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
.lr.ph.i: ; preds = %2
%11 = trunc i64 %.sroa.545.0.copyload to i32
%12 = icmp sgt i32 %.sroa.648.0.copyload, 0
%13 = lshr i64 %.sroa.545.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i
br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i
%15 = add i32 %.sroa.648.0.copyload, -1
%xtraiter = and i32 %.sroa.648.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.648.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i
br label %18
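; Main reduction loop, unrolled by 4: four ld.global.nc loads at consecutive
; multiples of the reduction stride %14, chained into a running fadd. The `or`
; trick for indices 1..3 is valid because the unrolled counter is always a
; multiple of 4; %niter counts the unrolled trip count down in steps of 4.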
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa67 = phi i32 [ %47, %18 ]
%.lcssa66 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
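; Epilogue loop: runs the %xtraiter (= extent & 3) leftover iterations one at
; a time, continuing the partial sum carried in %.unr.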
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; <label>:48: ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !61
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa68 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa68, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit
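; Zero-trip reduction path (reduced extent <= 0): every covered output element
; is simply set to 0.0, the sum identity.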
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit, %2
ret void
}
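
; InitVector kernel: writes the scalar argument %0 into every element of a 1-D
; output of size %1 using a grid-stride loop (stride = gridDim.x * blockDim.x).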
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = add nuw nsw i32 %6, %7
%9 = icmp slt i32 %8, %1
br i1 %9, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %3
%10 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %2, i64 0, i32 0
%11 = load float*, float** %10, align 8
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %5
br label %14
._crit_edge.loopexit: ; preds = %14
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %3
ret void
; <label>:14: ; preds = %.lr.ph, %14
%.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ]
%15 = sext i32 %.08 to i64
%16 = getelementptr inbounds float, float* %11, i64 %15
store float %0, float* %16, align 4
%17 = add i32 %13, %.08
%18 = icmp slt i32 %17, %1
br i1 %18, label %14, label %._crit_edge.loopexit
}
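
; RowReduceKernel<32, 256, 128> with CudaSumReducer: after the same
; launch-shape assertion preamble (256 threads, 32 blocks, all other dims 1),
; each work item covers a 32768-element chunk of one row; every thread
; accumulates up to 128 elements (8 outer steps x 16 loads, stride 256), then
; the warp is reduced with shfl.down and lane 0 atomically adds into the
; output row.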
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0
%38 = load float*, float** %37, align 8
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
br label %41
._crit_edge.loopexit: ; preds = %187
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:41: ; preds = %.lr.ph, %187
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ]
%42 = srem i32 %.0114, %31
%43 = sdiv i32 %.0114, %31
%44 = shl nsw i32 %42, 15
%45 = or i32 %44, %34
%46 = icmp slt i32 %43, %2
br i1 %46, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %164, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %41
%47 = mul nsw i32 %43, %3
%48 = add i32 %47, %45
br label %49
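; Main accumulation loop (block 49): 8 iterations of 16 unconditional loads.
; Only the last offset of the group (base | 3840) is bounds-checked up front;
; if it is in range the whole 16-load block below executes, otherwise control
; falls through to the per-element tail at .preheader101.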
; <label>:49: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ]
%50 = add nuw nsw i32 %.098108, 16
%51 = shl i32 %.098108, 8
%52 = or i32 %51, 3840
%53 = add nsw i32 %52, %45
%54 = icmp slt i32 %53, %3
br i1 %54, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %49
%55 = add i32 %48, %51
%56 = sext i32 %55 to i64
%57 = getelementptr inbounds float, float* %40, i64 %56
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8
%59 = fadd float %.095109, %58
%60 = shl i32 %.098108, 8
%61 = or i32 %60, 256
%62 = add i32 %48, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, float* %40, i64 %63
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8
%66 = fadd float %59, %65
%67 = shl i32 %.098108, 8
%68 = or i32 %67, 512
%69 = add i32 %48, %68
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %40, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %66, %72
%74 = shl i32 %.098108, 8
%75 = or i32 %74, 768
%76 = add i32 %48, %75
%77 = sext i32 %76 to i64
%78 = getelementptr inbounds float, float* %40, i64 %77
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8
%80 = fadd float %73, %79
%81 = shl i32 %.098108, 8
%82 = or i32 %81, 1024
%83 = add i32 %48, %82
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %40, i64 %84
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8
%87 = fadd float %80, %86
%88 = shl i32 %.098108, 8
%89 = or i32 %88, 1280
%90 = add i32 %48, %89
%91 = sext i32 %90 to i64
%92 = getelementptr inbounds float, float* %40, i64 %91
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8
%94 = fadd float %87, %93
%95 = shl i32 %.098108, 8
%96 = or i32 %95, 1536
%97 = add i32 %48, %96
%98 = sext i32 %97 to i64
%99 = getelementptr inbounds float, float* %40, i64 %98
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8
%101 = fadd float %94, %100
%102 = shl i32 %.098108, 8
%103 = or i32 %102, 1792
%104 = add i32 %48, %103
%105 = sext i32 %104 to i64
%106 = getelementptr inbounds float, float* %40, i64 %105
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8
%108 = fadd float %101, %107
%109 = shl i32 %.098108, 8
%110 = or i32 %109, 2048
%111 = add i32 %48, %110
%112 = sext i32 %111 to i64
%113 = getelementptr inbounds float, float* %40, i64 %112
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8
%115 = fadd float %108, %114
%116 = shl i32 %.098108, 8
%117 = or i32 %116, 2304
%118 = add i32 %48, %117
%119 = sext i32 %118 to i64
%120 = getelementptr inbounds float, float* %40, i64 %119
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8
%122 = fadd float %115, %121
%123 = shl i32 %.098108, 8
%124 = or i32 %123, 2560
%125 = add i32 %48, %124
%126 = sext i32 %125 to i64
%127 = getelementptr inbounds float, float* %40, i64 %126
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8
%129 = fadd float %122, %128
%130 = shl i32 %.098108, 8
%131 = or i32 %130, 2816
%132 = add i32 %48, %131
%133 = sext i32 %132 to i64
%134 = getelementptr inbounds float, float* %40, i64 %133
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8
%136 = fadd float %129, %135
%137 = shl i32 %.098108, 8
%138 = or i32 %137, 3072
%139 = add i32 %48, %138
%140 = sext i32 %139 to i64
%141 = getelementptr inbounds float, float* %40, i64 %140
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8
%143 = fadd float %136, %142
%144 = shl i32 %.098108, 8
%145 = or i32 %144, 3328
%146 = add i32 %48, %145
%147 = sext i32 %146 to i64
%148 = getelementptr inbounds float, float* %40, i64 %147
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8
%150 = fadd float %143, %149
%151 = shl i32 %.098108, 8
%152 = or i32 %151, 3584
%153 = add i32 %48, %152
%154 = sext i32 %153 to i64
%155 = getelementptr inbounds float, float* %40, i64 %154
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8
%157 = fadd float %150, %156
%158 = shl i32 %.098108, 8
%159 = or i32 %158, 3840
%160 = add i32 %48, %159
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %40, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %157, %163
%165 = icmp slt i32 %50, 128
br i1 %165, label %49, label %.thread.preheader.loopexit
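; Bounds-checked tail: the 16 offsets (0, 256, ..., 3840) are now tested
; against the row length one by one; each of the blocks 168/190/200/... adds
; a single guarded load, bailing out to the warp-reduction phase at the first
; out-of-range offset.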
.preheader101: ; preds = %49
%.lcssa = phi i32 [ %51, %49 ]
%.098108.lcssa = phi i32 [ %.098108, %49 ]
%.095109.lcssa = phi float [ %.095109, %49 ]
%166 = add nsw i32 %.lcssa, %45
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %47
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %40, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = fadd float %.095109.lcssa, %172
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %45
%177 = icmp slt i32 %176, %3
br i1 %177, label %190, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %46, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %187, label %183
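; Warp-level tree reduction: five shfl.down.b32 steps with deltas 16, 8, 4,
; 2, 1 (the trailing 31 in the inline PTX is the clamp operand), adding the
; shuffled lane's value into the running sum each step.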
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = fadd float %.8112, %179
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !62
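; Final merge (block 183): only lane 0 of the warp (%.not tested above) with a
; valid row reaches this block and adds the warp's partial sum into the output
; via llvm.nvvm.atomic.load.add.f32; block 187 then strides to the next work
; item by 32, the grid size.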
; <label>:183: ; preds = %178
%184 = sext i32 %43 to i64
%185 = getelementptr inbounds float, float* %38, i64 %184
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8
br label %187
; <label>:187: ; preds = %178, %183
%188 = add nuw nsw i32 %.0114, 32
%189 = icmp slt i32 %188, %32
br i1 %189, label %41, label %._crit_edge.loopexit
; <label>:190: ; preds = %168
%191 = add nsw i32 %176, %47
%192 = sext i32 %191 to i64
%193 = getelementptr inbounds float, float* %40, i64 %192
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8
%195 = fadd float %173, %194
%196 = shl i32 %.098108.lcssa, 8
%197 = or i32 %196, 512
%198 = add nsw i32 %197, %45
%199 = icmp slt i32 %198, %3
br i1 %199, label %200, label %.thread.preheader
; <label>:200: ; preds = %190
%201 = add nsw i32 %198, %47
%202 = sext i32 %201 to i64
%203 = getelementptr inbounds float, float* %40, i64 %202
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8
%205 = fadd float %195, %204
%206 = shl i32 %.098108.lcssa, 8
%207 = or i32 %206, 768
%208 = add nsw i32 %207, %45
%209 = icmp slt i32 %208, %3
br i1 %209, label %210, label %.thread.preheader
; <label>:210: ; preds = %200
%211 = add nsw i32 %208, %47
%212 = sext i32 %211 to i64
%213 = getelementptr inbounds float, float* %40, i64 %212
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8
%215 = fadd float %205, %214
%216 = shl i32 %.098108.lcssa, 8
%217 = or i32 %216, 1024
%218 = add nsw i32 %217, %45
%219 = icmp slt i32 %218, %3
br i1 %219, label %220, label %.thread.preheader
; <label>:220: ; preds = %210
%221 = add nsw i32 %218, %47
%222 = sext i32 %221 to i64
%223 = getelementptr inbounds float, float* %40, i64 %222
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8
%225 = fadd float %215, %224
%226 = shl i32 %.098108.lcssa, 8
%227 = or i32 %226, 1280
%228 = add nsw i32 %227, %45
%229 = icmp slt i32 %228, %3
br i1 %229, label %230, label %.thread.preheader
; <label>:230: ; preds = %220
%231 = add nsw i32 %228, %47
%232 = sext i32 %231 to i64
%233 = getelementptr inbounds float, float* %40, i64 %232
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8
%235 = fadd float %225, %234
%236 = shl i32 %.098108.lcssa, 8
%237 = or i32 %236, 1536
%238 = add nsw i32 %237, %45
%239 = icmp slt i32 %238, %3
br i1 %239, label %240, label %.thread.preheader
; <label>:240: ; preds = %230
%241 = add nsw i32 %238, %47
%242 = sext i32 %241 to i64
%243 = getelementptr inbounds float, float* %40, i64 %242
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8
%245 = fadd float %235, %244
%246 = shl i32 %.098108.lcssa, 8
%247 = or i32 %246, 1792
%248 = add nsw i32 %247, %45
%249 = icmp slt i32 %248, %3
br i1 %249, label %250, label %.thread.preheader
; <label>:250: ; preds = %240
%251 = add nsw i32 %248, %47
%252 = sext i32 %251 to i64
%253 = getelementptr inbounds float, float* %40, i64 %252
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8
%255 = fadd float %245, %254
%256 = shl i32 %.098108.lcssa, 8
%257 = or i32 %256, 2048
%258 = add nsw i32 %257, %45
%259 = icmp slt i32 %258, %3
br i1 %259, label %260, label %.thread.preheader
; <label>:260: ; preds = %250
%261 = add nsw i32 %258, %47
%262 = sext i32 %261 to i64
%263 = getelementptr inbounds float, float* %40, i64 %262
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8
%265 = fadd float %255, %264
%266 = shl i32 %.098108.lcssa, 8
%267 = or i32 %266, 2304
%268 = add nsw i32 %267, %45
%269 = icmp slt i32 %268, %3
br i1 %269, label %270, label %.thread.preheader
; <label>:270: ; preds = %260
%271 = add nsw i32 %268, %47
%272 = sext i32 %271 to i64
%273 = getelementptr inbounds float, float* %40, i64 %272
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8
%275 = fadd float %265, %274
%276 = shl i32 %.098108.lcssa, 8
%277 = or i32 %276, 2560
%278 = add nsw i32 %277, %45
%279 = icmp slt i32 %278, %3
br i1 %279, label %280, label %.thread.preheader
; <label>:280: ; preds = %270
%281 = add nsw i32 %278, %47
%282 = sext i32 %281 to i64
%283 = getelementptr inbounds float, float* %40, i64 %282
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8
%285 = fadd float %275, %284
%286 = shl i32 %.098108.lcssa, 8
%287 = or i32 %286, 2816
%288 = add nsw i32 %287, %45
%289 = icmp slt i32 %288, %3
br i1 %289, label %290, label %.thread.preheader
; <label>:290: ; preds = %280
%291 = add nsw i32 %288, %47
%292 = sext i32 %291 to i64
%293 = getelementptr inbounds float, float* %40, i64 %292
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8
%295 = fadd float %285, %294
%296 = shl i32 %.098108.lcssa, 8
%297 = or i32 %296, 3072
%298 = add nsw i32 %297, %45
%299 = icmp slt i32 %298, %3
br i1 %299, label %300, label %.thread.preheader
; <label>:300: ; preds = %290
%301 = add nsw i32 %298, %47
%302 = sext i32 %301 to i64
%303 = getelementptr inbounds float, float* %40, i64 %302
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8
%305 = fadd float %295, %304
%306 = shl i32 %.098108.lcssa, 8
%307 = or i32 %306, 3328
%308 = add nsw i32 %307, %45
%309 = icmp slt i32 %308, %3
br i1 %309, label %310, label %.thread.preheader
; <label>:310: ; preds = %300
%311 = add nsw i32 %308, %47
%312 = sext i32 %311 to i64
%313 = getelementptr inbounds float, float* %40, i64 %312
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8
%315 = fadd float %305, %314
%316 = shl i32 %.098108.lcssa, 8
%317 = or i32 %316, 3584
%318 = add nsw i32 %317, %45
%319 = icmp slt i32 %318, %3
br i1 %319, label %320, label %.thread.preheader
; <label>:320: ; preds = %310
%321 = add nsw i32 %318, %47
%322 = sext i32 %321 to i64
%323 = getelementptr inbounds float, float* %40, i64 %322
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8
%325 = fadd float %315, %324
br label %.thread.preheader
}
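
; RowReduceKernel<32, 256, 128> with CudaMaxReducer: identical control
; skeleton to the sum variant above, but the accumulator starts from the
; reducer's stored initial value (%.idx.val), fadd becomes llvm.nvvm.fmax.f,
; and the final merge is a cmpxchg loop implementing atomic max instead of a
; native atomic add.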
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
br label %39
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ]
%40 = srem i32 %.0114, %31
%41 = sdiv i32 %.0114, %31
%42 = shl nsw i32 %40, 15
%43 = or i32 %42, %34
%.idx.val = load float, float* %.idx, align 4
%44 = icmp slt i32 %41, %2
br i1 %44, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %163, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %39
%45 = mul nsw i32 %41, %3
%46 = add i32 %45, %43
%47 = load float*, float** %38, align 8
br label %48
; <label>:48: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ]
%49 = add nuw nsw i32 %.098108, 16
%50 = shl i32 %.098108, 8
%51 = or i32 %50, 3840
%52 = add nsw i32 %51, %43
%53 = icmp slt i32 %52, %3
br i1 %53, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %48
%54 = add i32 %46, %50
%55 = sext i32 %54 to i64
%56 = getelementptr inbounds float, float* %47, i64 %55
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8
%59 = shl i32 %.098108, 8
%60 = or i32 %59, 256
%61 = add i32 %46, %60
%62 = sext i32 %61 to i64
%63 = getelementptr inbounds float, float* %47, i64 %62
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8
%66 = shl i32 %.098108, 8
%67 = or i32 %66, 512
%68 = add i32 %46, %67
%69 = sext i32 %68 to i64
%70 = getelementptr inbounds float, float* %47, i64 %69
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8
%73 = shl i32 %.098108, 8
%74 = or i32 %73, 768
%75 = add i32 %46, %74
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %47, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8
%80 = shl i32 %.098108, 8
%81 = or i32 %80, 1024
%82 = add i32 %46, %81
%83 = sext i32 %82 to i64
%84 = getelementptr inbounds float, float* %47, i64 %83
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8
%87 = shl i32 %.098108, 8
%88 = or i32 %87, 1280
%89 = add i32 %46, %88
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds float, float* %47, i64 %90
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8
%94 = shl i32 %.098108, 8
%95 = or i32 %94, 1536
%96 = add i32 %46, %95
%97 = sext i32 %96 to i64
%98 = getelementptr inbounds float, float* %47, i64 %97
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8
%101 = shl i32 %.098108, 8
%102 = or i32 %101, 1792
%103 = add i32 %46, %102
%104 = sext i32 %103 to i64
%105 = getelementptr inbounds float, float* %47, i64 %104
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8
%108 = shl i32 %.098108, 8
%109 = or i32 %108, 2048
%110 = add i32 %46, %109
%111 = sext i32 %110 to i64
%112 = getelementptr inbounds float, float* %47, i64 %111
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8
%115 = shl i32 %.098108, 8
%116 = or i32 %115, 2304
%117 = add i32 %46, %116
%118 = sext i32 %117 to i64
%119 = getelementptr inbounds float, float* %47, i64 %118
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8
%122 = shl i32 %.098108, 8
%123 = or i32 %122, 2560
%124 = add i32 %46, %123
%125 = sext i32 %124 to i64
%126 = getelementptr inbounds float, float* %47, i64 %125
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8
%129 = shl i32 %.098108, 8
%130 = or i32 %129, 2816
%131 = add i32 %46, %130
%132 = sext i32 %131 to i64
%133 = getelementptr inbounds float, float* %47, i64 %132
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8
%136 = shl i32 %.098108, 8
%137 = or i32 %136, 3072
%138 = add i32 %46, %137
%139 = sext i32 %138 to i64
%140 = getelementptr inbounds float, float* %47, i64 %139
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8
%143 = shl i32 %.098108, 8
%144 = or i32 %143, 3328
%145 = add i32 %46, %144
%146 = sext i32 %145 to i64
%147 = getelementptr inbounds float, float* %47, i64 %146
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8
%150 = shl i32 %.098108, 8
%151 = or i32 %150, 3584
%152 = add i32 %46, %151
%153 = sext i32 %152 to i64
%154 = getelementptr inbounds float, float* %47, i64 %153
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8
%157 = shl i32 %.098108, 8
%158 = or i32 %157, 3840
%159 = add i32 %46, %158
%160 = sext i32 %159 to i64
%161 = getelementptr inbounds float, float* %47, i64 %160
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8
%164 = icmp slt i32 %49, 128
br i1 %164, label %48, label %.thread.preheader.loopexit
.preheader101: ; preds = %48
%.lcssa = phi i32 [ %50, %48 ]
%.098108.lcssa = phi i32 [ %.098108, %48 ]
%.095109.lcssa = phi float [ %.095109, %48 ]
%165 = load float*, float** %38, align 8
%166 = add nsw i32 %.lcssa, %43
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %45
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %165, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %43
%177 = icmp slt i32 %176, %3
br i1 %177, label %198, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %44, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !63
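; NOTE: the .thread loop above is a warp-level tree reduction: shfl.down.b32
; pulls a partial max from %.092113 lanes away (16, 8, 4, 2, 1) and
; @llvm.nvvm.fmax.f folds it in, so after five iterations lane 0 holds the
; maximum of the whole warp.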
; <label>:183: ; preds = %178
%184 = sext i32 %41 to i64
%185 = load float*, float** %37, align 8
%186 = getelementptr inbounds float, float* %185, i64 %184
%187 = bitcast float %.lcssa138 to i32
%188 = bitcast float* %186 to i32*
%189 = load i32, i32* %188, align 4
br label %190
; <label>:190: ; preds = %193, %183
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ]
%191 = bitcast i32 %.011.i to float
%192 = fcmp olt float %191, %.lcssa138
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit
; <label>:193: ; preds = %190
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst
%195 = extractvalue { i32, i1 } %194, 0
%not..i = icmp eq i32 %.011.i, %195
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190
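; NOTE: blocks %183..%193 are the inlined body of
; CudaMaxReducer::atomic_reduce(float*, float): a float atomic-max emulated
; as a CAS loop — the candidate is bitcast to i32 and the seq_cst cmpxchg is
; retried until either the stored value is no longer smaller than the
; candidate or the exchange succeeds.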
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178
%196 = add nuw nsw i32 %.0114, 32
%197 = icmp slt i32 %196, %32
br i1 %197, label %39, label %._crit_edge.loopexit
; <label>:198: ; preds = %168
%199 = add nsw i32 %176, %45
%200 = sext i32 %199 to i64
%201 = getelementptr inbounds float, float* %165, i64 %200
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8
%204 = shl i32 %.098108.lcssa, 8
%205 = or i32 %204, 512
%206 = add nsw i32 %205, %43
%207 = icmp slt i32 %206, %3
br i1 %207, label %208, label %.thread.preheader
; <label>:208: ; preds = %198
%209 = add nsw i32 %206, %45
%210 = sext i32 %209 to i64
%211 = getelementptr inbounds float, float* %165, i64 %210
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8
%214 = shl i32 %.098108.lcssa, 8
%215 = or i32 %214, 768
%216 = add nsw i32 %215, %43
%217 = icmp slt i32 %216, %3
br i1 %217, label %218, label %.thread.preheader
; <label>:218: ; preds = %208
%219 = add nsw i32 %216, %45
%220 = sext i32 %219 to i64
%221 = getelementptr inbounds float, float* %165, i64 %220
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8
%224 = shl i32 %.098108.lcssa, 8
%225 = or i32 %224, 1024
%226 = add nsw i32 %225, %43
%227 = icmp slt i32 %226, %3
br i1 %227, label %228, label %.thread.preheader
; <label>:228: ; preds = %218
%229 = add nsw i32 %226, %45
%230 = sext i32 %229 to i64
%231 = getelementptr inbounds float, float* %165, i64 %230
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8
%234 = shl i32 %.098108.lcssa, 8
%235 = or i32 %234, 1280
%236 = add nsw i32 %235, %43
%237 = icmp slt i32 %236, %3
br i1 %237, label %238, label %.thread.preheader
; <label>:238: ; preds = %228
%239 = add nsw i32 %236, %45
%240 = sext i32 %239 to i64
%241 = getelementptr inbounds float, float* %165, i64 %240
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8
%244 = shl i32 %.098108.lcssa, 8
%245 = or i32 %244, 1536
%246 = add nsw i32 %245, %43
%247 = icmp slt i32 %246, %3
br i1 %247, label %248, label %.thread.preheader
; <label>:248: ; preds = %238
%249 = add nsw i32 %246, %45
%250 = sext i32 %249 to i64
%251 = getelementptr inbounds float, float* %165, i64 %250
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8
%254 = shl i32 %.098108.lcssa, 8
%255 = or i32 %254, 1792
%256 = add nsw i32 %255, %43
%257 = icmp slt i32 %256, %3
br i1 %257, label %258, label %.thread.preheader
; <label>:258: ; preds = %248
%259 = add nsw i32 %256, %45
%260 = sext i32 %259 to i64
%261 = getelementptr inbounds float, float* %165, i64 %260
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8
%264 = shl i32 %.098108.lcssa, 8
%265 = or i32 %264, 2048
%266 = add nsw i32 %265, %43
%267 = icmp slt i32 %266, %3
br i1 %267, label %268, label %.thread.preheader
; <label>:268: ; preds = %258
%269 = add nsw i32 %266, %45
%270 = sext i32 %269 to i64
%271 = getelementptr inbounds float, float* %165, i64 %270
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8
%274 = shl i32 %.098108.lcssa, 8
%275 = or i32 %274, 2304
%276 = add nsw i32 %275, %43
%277 = icmp slt i32 %276, %3
br i1 %277, label %278, label %.thread.preheader
; <label>:278: ; preds = %268
%279 = add nsw i32 %276, %45
%280 = sext i32 %279 to i64
%281 = getelementptr inbounds float, float* %165, i64 %280
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8
%284 = shl i32 %.098108.lcssa, 8
%285 = or i32 %284, 2560
%286 = add nsw i32 %285, %43
%287 = icmp slt i32 %286, %3
br i1 %287, label %288, label %.thread.preheader
; <label>:288: ; preds = %278
%289 = add nsw i32 %286, %45
%290 = sext i32 %289 to i64
%291 = getelementptr inbounds float, float* %165, i64 %290
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8
%294 = shl i32 %.098108.lcssa, 8
%295 = or i32 %294, 2816
%296 = add nsw i32 %295, %43
%297 = icmp slt i32 %296, %3
br i1 %297, label %298, label %.thread.preheader
; <label>:298: ; preds = %288
%299 = add nsw i32 %296, %45
%300 = sext i32 %299 to i64
%301 = getelementptr inbounds float, float* %165, i64 %300
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8
%304 = shl i32 %.098108.lcssa, 8
%305 = or i32 %304, 3072
%306 = add nsw i32 %305, %43
%307 = icmp slt i32 %306, %3
br i1 %307, label %308, label %.thread.preheader
; <label>:308: ; preds = %298
%309 = add nsw i32 %306, %45
%310 = sext i32 %309 to i64
%311 = getelementptr inbounds float, float* %165, i64 %310
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8
%314 = shl i32 %.098108.lcssa, 8
%315 = or i32 %314, 3328
%316 = add nsw i32 %315, %43
%317 = icmp slt i32 %316, %3
br i1 %317, label %318, label %.thread.preheader
; <label>:318: ; preds = %308
%319 = add nsw i32 %316, %45
%320 = sext i32 %319 to i64
%321 = getelementptr inbounds float, float* %165, i64 %320
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8
%324 = shl i32 %.098108.lcssa, 8
%325 = or i32 %324, 3584
%326 = add nsw i32 %325, %43
%327 = icmp slt i32 %326, %3
br i1 %327, label %328, label %.thread.preheader
; <label>:328: ; preds = %318
%329 = add nsw i32 %326, %45
%330 = sext i32 %329 to i64
%331 = getelementptr inbounds float, float* %165, i64 %330
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8
br label %.thread.preheader
}
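; The kernel below is ColumnReduceKernel<128, 256, 16, ..., CudaSumReducer>.
; After asserting the exact launch shape it was specialized for (256x1x1
; threads, 128x1x1 blocks), each thread grid-strides over (column, 16-row
; tile) work items in steps of 32768, sums up to 16 __ldg loads per tile, and
; folds the tile sum into output[column] via @llvm.nvvm.atomic.load.add.f32.
; The .lr.ph.split path (reachable only when operand %3 is negative)
; degenerates into atomic adds of the reducer identity 0.0.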
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
%41 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0
%42 = load float*, float** %41, align 8
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
%43 = add i32 %32, -1
%44 = sub i32 %43, %34
%45 = sub i32 %44, %35
%46 = lshr i32 %45, 15
%47 = add nuw nsw i32 %46, 1
%xtraiter = and i32 %47, 3
%48 = icmp ult i32 %45, 98304
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader
%unroll_iter = sub nsw i32 %47, %xtraiter
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ]
%49 = srem i32 %.047.us, %3
%50 = sdiv i32 %.047.us, %3
%51 = srem i32 %50, %31
%52 = shl nsw i32 %51, 4
br label %53
; <label>:53: ; preds = %104, %.lr.ph.split.us
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ]
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ]
%54 = add nuw nsw i32 %.04346.us.us, %52
%55 = icmp slt i32 %54, %2
br i1 %55, label %56, label %62
; <label>:56: ; preds = %53
%57 = mul nsw i32 %54, %3
%58 = add nsw i32 %57, %49
%59 = sext i32 %58 to i64
%60 = getelementptr inbounds float, float* %40, i64 %59
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8
br label %62
; <label>:62: ; preds = %56, %53
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ]
%64 = fadd float %.04445.us.us, %63
%65 = or i32 %.04346.us.us, 1
%66 = add nuw nsw i32 %65, %52
%67 = icmp slt i32 %66, %2
br i1 %67, label %98, label %104
.us-lcssa.us.us: ; preds = %104
%.lcssa = phi float [ %106, %104 ]
%68 = sext i32 %49 to i64
%69 = getelementptr inbounds float, float* %42, i64 %68
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8
%71 = add nuw nsw i32 %.047.us, 32768
%72 = icmp slt i32 %71, %32
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us
br label %._crit_edge
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ]
br label %._crit_edge.loopexit59.unr-lcssa
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa
br label %.lr.ph.split.epil
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ]
%73 = srem i32 %.047.epil, %3
%74 = sext i32 %73 to i64
%75 = getelementptr inbounds float, float* %42, i64 %74
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8
%77 = add nuw nsw i32 %.047.epil, 32768
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !64
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil
br label %._crit_edge.loopexit59
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29
ret void
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ]
%78 = srem i32 %.047, %3
%79 = sext i32 %78 to i64
%80 = getelementptr inbounds float, float* %42, i64 %79
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8
%82 = add nuw nsw i32 %.047, 32768
%83 = srem i32 %82, %3
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %42, i64 %84
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8
%87 = add nsw i32 %.047, 65536
%88 = srem i32 %87, %3
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %42, i64 %89
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8
%92 = add nsw i32 %.047, 98304
%93 = srem i32 %92, %3
%94 = sext i32 %93 to i64
%95 = getelementptr inbounds float, float* %42, i64 %94
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8
%97 = add nsw i32 %.047, 131072
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split
; <label>:98: ; preds = %62
%99 = mul nsw i32 %66, %3
%100 = add nsw i32 %99, %49
%101 = sext i32 %100 to i64
%102 = getelementptr inbounds float, float* %40, i64 %101
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8
br label %104
; <label>:104: ; preds = %98, %62
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ]
%106 = fadd float %64, %105
%107 = add nsw i32 %.04346.us.us, 2
%exitcond.1 = icmp eq i32 %107, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53
}
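; The kernel below is the CudaMaxReducer specialization of the same
; ColumnReduceKernel<128, 256, 16, ...>: identical traversal, but each tile
; is folded with @llvm.nvvm.fmax.f starting from the reducer's stored initial
; value (%.idx45), and the per-column result is committed through the
; CAS-based atomic_reduce loop rather than an atomic add.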
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ]
%41 = srem i32 %.048.us, %3
%42 = sdiv i32 %.048.us, %3
%43 = srem i32 %42, %31
%44 = shl nsw i32 %43, 4
%.idx45.val.us = load float, float* %.idx45, align 4
%45 = load float*, float** %39, align 8
br label %54
; <label>:46: ; preds = %49, %.us-lcssa.us.us
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ]
%47 = bitcast i32 %.011.i.us to float
%48 = fcmp olt float %47, %.lcssa
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
; <label>:49: ; preds = %46
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst
%51 = extractvalue { i32, i1 } %50, 0
%not..i.us = icmp eq i32 %.011.i.us, %51
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46
%52 = add nuw nsw i32 %.048.us, 32768
%53 = icmp slt i32 %52, %32
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit
; <label>:54: ; preds = %112, %.lr.ph.split.us
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ]
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ]
%55 = add nuw nsw i32 %.04347.us.us, %44
%56 = icmp slt i32 %55, %2
br i1 %56, label %57, label %63
; <label>:57: ; preds = %54
%58 = mul nsw i32 %55, %3
%59 = add nsw i32 %58, %41
%60 = sext i32 %59 to i64
%61 = getelementptr inbounds float, float* %45, i64 %60
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8
br label %63
; <label>:63: ; preds = %54, %57
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ]
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8
%66 = or i32 %.04347.us.us, 1
%67 = add nuw nsw i32 %66, %44
%68 = icmp slt i32 %67, %2
br i1 %68, label %106, label %112
.us-lcssa.us.us: ; preds = %112
%.lcssa = phi float [ %114, %112 ]
%69 = sext i32 %41 to i64
%70 = load float*, float** %40, align 8
%71 = getelementptr inbounds float, float* %70, i64 %69
%72 = bitcast float %.lcssa to i32
%73 = bitcast float* %71 to i32*
%74 = load i32, i32* %73, align 4
br label %46
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
br label %._crit_edge
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29
ret void
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ]
%.idx45.val = load float, float* %.idx45, align 4
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8
%91 = srem i32 %.048, %3
%92 = sext i32 %91 to i64
%93 = load float*, float** %40, align 8
%94 = getelementptr inbounds float, float* %93, i64 %92
%95 = bitcast float %90 to i32
%96 = bitcast float* %94 to i32*
%97 = load i32, i32* %96, align 4
br label %98
; <label>:98: ; preds = %101, %.lr.ph.split
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ]
%99 = bitcast i32 %.011.i to float
%100 = fcmp olt float %99, %90
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
; <label>:101: ; preds = %98
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst
%103 = extractvalue { i32, i1 } %102, 0
%not..i = icmp eq i32 %.011.i, %103
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101
%104 = add nuw nsw i32 %.048, 32768
%105 = icmp slt i32 %104, %32
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60
; <label>:106: ; preds = %63
%107 = mul nsw i32 %67, %3
%108 = add nsw i32 %107, %41
%109 = sext i32 %108 to i64
%110 = getelementptr inbounds float, float* %45, i64 %109
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8
br label %112
; <label>:112: ; preds = %106, %63
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ]
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8
%115 = add nsw i32 %.04347.us.us, 2
%exitcond.1 = icmp eq i32 %115, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54
}
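; The kernel below is FullReductionKernel<256, 128, ..., SumReducer<float>>,
; reducing the whole tensor to a single scalar (presumably 256 threads per
; block, 128 coefficients per thread). When launched with one block, global
; thread 0 zeroes the output and all threads sync before accumulating. Each
; thread then sums a private slice of at most 32768 coefficients in an
; 8x-unrolled __ldg loop of stride 256, a shfl.down tree reduces each warp,
; and lane 0 atomic-adds its warp's sum into the output.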
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, float*) #2 comdat {
%5 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%6 = shl nuw nsw i32 %5, 15
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = or i32 %6, %7
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%10 = icmp eq i32 %9, 1
br i1 %10, label %11, label %15
; <label>:11: ; preds = %4
%12 = icmp eq i32 %8, 0
br i1 %12, label %13, label %14
; <label>:13: ; preds = %11
store float 0.000000e+00, float* %3, align 4
br label %14
; <label>:14: ; preds = %13, %11
tail call void @llvm.cuda.syncthreads()
br label %15
; <label>:15: ; preds = %14, %4
%16 = sub nsw i32 %2, %8
%17 = icmp sgt i32 %16, 32768
%..i = select i1 %17, i32 32768, i32 %16
%18 = icmp sgt i32 %16, 0
br i1 %18, label %.lr.ph, label %.preheader.preheader
.preheader.preheader.loopexit: ; preds = %.epil.preheader
%.lcssa47 = phi float [ %23, %.epil.preheader ]
br label %.preheader.preheader
.preheader.preheader: ; preds = %.preheader.preheader.loopexit, %.preheader.preheader.loopexit.unr-lcssa, %15
%.132.ph = phi float [ 0.000000e+00, %15 ], [ %.lcssa36.ph, %.preheader.preheader.loopexit.unr-lcssa ], [ %.lcssa47, %.preheader.preheader.loopexit ]
br label %.preheader
.preheader.preheader.loopexit.unr-lcssa.loopexit: ; preds = %32
%.lcssa49 = phi i32 [ %80, %32 ]
%.lcssa48 = phi float [ %79, %32 ]
br label %.preheader.preheader.loopexit.unr-lcssa
.preheader.preheader.loopexit.unr-lcssa: ; preds = %.preheader.preheader.loopexit.unr-lcssa.loopexit, %.lr.ph
%.lcssa36.ph = phi float [ undef, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%.02535.unr = phi i32 [ 0, %.lr.ph ], [ %.lcssa49, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%.03134.unr = phi float [ 0.000000e+00, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %.preheader.preheader, label %.epil.preheader.preheader
.epil.preheader.preheader: ; preds = %.preheader.preheader.loopexit.unr-lcssa
br label %.epil.preheader
.epil.preheader: ; preds = %.epil.preheader.preheader, %.epil.preheader
%.02535.epil = phi i32 [ %24, %.epil.preheader ], [ %.02535.unr, %.epil.preheader.preheader ]
%.03134.epil = phi float [ %23, %.epil.preheader ], [ %.03134.unr, %.epil.preheader.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.epil.preheader ], [ %xtraiter, %.epil.preheader.preheader ]
%19 = add nuw nsw i32 %.02535.epil, %8
%20 = sext i32 %19 to i64
%21 = getelementptr inbounds float, float* %26, i64 %20
%22 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %21, i32 4) #8
%23 = fadd float %.03134.epil, %22
%24 = add nuw nsw i32 %.02535.epil, 256
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %.preheader.preheader.loopexit, label %.epil.preheader, !llvm.loop !65
.lr.ph: ; preds = %15
%25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0
%26 = load float*, float** %25, align 8
%27 = icmp sgt i32 %..i, 256
%smax = select i1 %27, i32 %..i, i32 256
%28 = add i32 %smax, -1
%29 = lshr i32 %28, 8
%30 = add nuw nsw i32 %29, 1
%xtraiter = and i32 %30, 7
%31 = icmp ult i32 %28, 1792
br i1 %31, label %.preheader.preheader.loopexit.unr-lcssa, label %.lr.ph.new
.lr.ph.new: ; preds = %.lr.ph
%unroll_iter = sub nsw i32 %30, %xtraiter
br label %32
; <label>:32: ; preds = %32, %.lr.ph.new
%.02535 = phi i32 [ 0, %.lr.ph.new ], [ %80, %32 ]
%.03134 = phi float [ 0.000000e+00, %.lr.ph.new ], [ %79, %32 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.new ], [ %niter.nsub.7, %32 ]
%33 = add nuw nsw i32 %.02535, %8
%34 = sext i32 %33 to i64
%35 = getelementptr inbounds float, float* %26, i64 %34
%36 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %35, i32 4) #8
%37 = fadd float %.03134, %36
%38 = or i32 %.02535, 256
%39 = add nuw nsw i32 %38, %8
%40 = sext i32 %39 to i64
%41 = getelementptr inbounds float, float* %26, i64 %40
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
%43 = fadd float %37, %42
%44 = or i32 %.02535, 512
%45 = add nuw nsw i32 %44, %8
%46 = sext i32 %45 to i64
%47 = getelementptr inbounds float, float* %26, i64 %46
%48 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %47, i32 4) #8
%49 = fadd float %43, %48
%50 = or i32 %.02535, 768
%51 = add nuw nsw i32 %50, %8
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %26, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = or i32 %.02535, 1024
%57 = add nuw nsw i32 %56, %8
%58 = sext i32 %57 to i64
%59 = getelementptr inbounds float, float* %26, i64 %58
%60 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %59, i32 4) #8
%61 = fadd float %55, %60
%62 = or i32 %.02535, 1280
%63 = add nuw nsw i32 %62, %8
%64 = sext i32 %63 to i64
%65 = getelementptr inbounds float, float* %26, i64 %64
%66 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %65, i32 4) #8
%67 = fadd float %61, %66
%68 = or i32 %.02535, 1536
%69 = add nuw nsw i32 %68, %8
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %26, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %67, %72
%74 = or i32 %.02535, 1792
%75 = add nuw nsw i32 %74, %8
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %26, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = fadd float %73, %78
%80 = add nsw i32 %.02535, 2048
%niter.nsub.7 = add i32 %niter, -8
%niter.ncmp.7 = icmp eq i32 %niter.nsub.7, 0
br i1 %niter.ncmp.7, label %.preheader.preheader.loopexit.unr-lcssa.loopexit, label %32, !llvm.loop !66
; <label>:81: ; preds = %.preheader
%.lcssa = phi float [ %85, %.preheader ]
%82 = and i32 %7, 31
%83 = icmp eq i32 %82, 0
br i1 %83, label %88, label %90
.preheader: ; preds = %.preheader.preheader, %.preheader
%.033 = phi i32 [ %86, %.preheader ], [ 16, %.preheader.preheader ]
%.132 = phi float [ %85, %.preheader ], [ %.132.ph, %.preheader.preheader ]
%84 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.132, i32 %.033, i32 31) #3, !srcloc !53
%85 = fadd float %.132, %84
%86 = lshr i32 %.033, 1
%87 = icmp eq i32 %86, 0
br i1 %87, label %81, label %.preheader, !llvm.loop !67
; <label>:88: ; preds = %81
%89 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %3, float %.lcssa) #8
br label %90
; <label>:90: ; preds = %88, %81
ret void
}
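; The kernel below is InnerReductionKernel<128, ..., SumReducer<float>>,
; reducing along the contiguous inner dimension. With a single block it first
; zeroes the per-row outputs. Each block then owns (output row, chunk of
; 128*blockDim.x coefficients) work items: a 16x-unrolled __ldg loop strides
; through the chunk at blockDim.x floats per step, a scalar tail loop
; finishes the ragged edge, a shfl.down tree combines each warp, and lane 0
; atomic-adds into output[row]; syncthreads brackets each work item.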
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = shl nuw nsw i32 %6, 7
%8 = add i32 %2, -1
%9 = add i32 %8, %7
%10 = udiv i32 %9, %7
%11 = mul nsw i32 %10, %3
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %6
%14 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%15 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%16 = icmp eq i32 %12, 1
br i1 %16, label %22, label %.preheader94
.preheader94.loopexit: ; preds = %.lr.ph109
br label %.preheader94
.preheader94: ; preds = %.preheader94.loopexit, %22, %5
%17 = icmp slt i32 %14, %11
br i1 %17, label %.lr.ph106, label %._crit_edge
.lr.ph106: ; preds = %.preheader94
%18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0
%19 = load float*, float** %18, align 8
%20 = and i32 %15, 31
%21 = icmp eq i32 %20, 0
br label %30
; <label>:22: ; preds = %5
%23 = mul nuw nsw i32 %14, %6
%24 = add nuw nsw i32 %23, %15
%25 = icmp slt i32 %24, %3
br i1 %25, label %.lr.ph109.preheader, label %.preheader94
.lr.ph109.preheader: ; preds = %22
br label %.lr.ph109
.lr.ph109: ; preds = %.lr.ph109.preheader, %.lr.ph109
%.081107 = phi i32 [ %28, %.lr.ph109 ], [ %24, %.lr.ph109.preheader ]
%26 = sext i32 %.081107 to i64
%27 = getelementptr inbounds float, float* %4, i64 %26
store float 0.000000e+00, float* %27, align 4
%28 = add nsw i32 %.081107, %13
%29 = icmp slt i32 %28, %3
br i1 %29, label %.lr.ph109, label %.preheader94.loopexit
._crit_edge.loopexit: ; preds = %177
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %.preheader94
ret void
; <label>:30: ; preds = %.lr.ph106, %177
%.083105 = phi i32 [ %14, %.lr.ph106 ], [ %178, %177 ]
%31 = sdiv i32 %.083105, %10
%32 = icmp slt i32 %31, %3
br i1 %32, label %33, label %177
; <label>:33: ; preds = %30
%34 = srem i32 %.083105, %10
%35 = mul i32 %7, %34
%36 = add i32 %35, %15
%37 = mul nsw i32 %31, %2
%38 = add i32 %36, %37
br label %39
; <label>:39: ; preds = %33, %.preheader.preheader
%.086100 = phi i32 [ 0, %33 ], [ %40, %.preheader.preheader ]
%.09299 = phi float [ 0.000000e+00, %33 ], [ %155, %.preheader.preheader ]
%40 = add nuw nsw i32 %.086100, 16
%41 = or i32 %.086100, 15
%42 = mul i32 %41, %6
%43 = add i32 %42, %36
%44 = icmp slt i32 %43, %2
%45 = mul i32 %.086100, %6
br i1 %44, label %.preheader.preheader, label %157
.preheader.preheader: ; preds = %39
%46 = add i32 %38, %45
%47 = sext i32 %46 to i64
%48 = getelementptr inbounds float, float* %19, i64 %47
%49 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %48, i32 4) #8
%50 = fadd float %.09299, %49
%51 = or i32 %.086100, 1
%52 = mul i32 %51, %6
%53 = add i32 %38, %52
%54 = sext i32 %53 to i64
%55 = getelementptr inbounds float, float* %19, i64 %54
%56 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %55, i32 4) #8
%57 = fadd float %50, %56
%58 = or i32 %.086100, 2
%59 = mul i32 %58, %6
%60 = add i32 %38, %59
%61 = sext i32 %60 to i64
%62 = getelementptr inbounds float, float* %19, i64 %61
%63 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %62, i32 4) #8
%64 = fadd float %57, %63
%65 = or i32 %.086100, 3
%66 = mul i32 %65, %6
%67 = add i32 %38, %66
%68 = sext i32 %67 to i64
%69 = getelementptr inbounds float, float* %19, i64 %68
%70 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %69, i32 4) #8
%71 = fadd float %64, %70
%72 = or i32 %.086100, 4
%73 = mul i32 %72, %6
%74 = add i32 %38, %73
%75 = sext i32 %74 to i64
%76 = getelementptr inbounds float, float* %19, i64 %75
%77 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %76, i32 4) #8
%78 = fadd float %71, %77
%79 = or i32 %.086100, 5
%80 = mul i32 %79, %6
%81 = add i32 %38, %80
%82 = sext i32 %81 to i64
%83 = getelementptr inbounds float, float* %19, i64 %82
%84 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %83, i32 4) #8
%85 = fadd float %78, %84
%86 = or i32 %.086100, 6
%87 = mul i32 %86, %6
%88 = add i32 %38, %87
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %19, i64 %89
%91 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %90, i32 4) #8
%92 = fadd float %85, %91
%93 = or i32 %.086100, 7
%94 = mul i32 %93, %6
%95 = add i32 %38, %94
%96 = sext i32 %95 to i64
%97 = getelementptr inbounds float, float* %19, i64 %96
%98 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %97, i32 4) #8
%99 = fadd float %92, %98
%100 = or i32 %.086100, 8
%101 = mul i32 %100, %6
%102 = add i32 %38, %101
%103 = sext i32 %102 to i64
%104 = getelementptr inbounds float, float* %19, i64 %103
%105 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %104, i32 4) #8
%106 = fadd float %99, %105
%107 = or i32 %.086100, 9
%108 = mul i32 %107, %6
%109 = add i32 %38, %108
%110 = sext i32 %109 to i64
%111 = getelementptr inbounds float, float* %19, i64 %110
%112 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %111, i32 4) #8
%113 = fadd float %106, %112
%114 = or i32 %.086100, 10
%115 = mul i32 %114, %6
%116 = add i32 %38, %115
%117 = sext i32 %116 to i64
%118 = getelementptr inbounds float, float* %19, i64 %117
%119 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %118, i32 4) #8
%120 = fadd float %113, %119
%121 = or i32 %.086100, 11
%122 = mul i32 %121, %6
%123 = add i32 %38, %122
%124 = sext i32 %123 to i64
%125 = getelementptr inbounds float, float* %19, i64 %124
%126 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %125, i32 4) #8
%127 = fadd float %120, %126
%128 = or i32 %.086100, 12
%129 = mul i32 %128, %6
%130 = add i32 %38, %129
%131 = sext i32 %130 to i64
%132 = getelementptr inbounds float, float* %19, i64 %131
%133 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %132, i32 4) #8
%134 = fadd float %127, %133
%135 = or i32 %.086100, 13
%136 = mul i32 %135, %6
%137 = add i32 %38, %136
%138 = sext i32 %137 to i64
%139 = getelementptr inbounds float, float* %19, i64 %138
%140 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %139, i32 4) #8
%141 = fadd float %134, %140
%142 = or i32 %.086100, 14
%143 = mul i32 %142, %6
%144 = add i32 %38, %143
%145 = sext i32 %144 to i64
%146 = getelementptr inbounds float, float* %19, i64 %145
%147 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %146, i32 4) #8
%148 = fadd float %141, %147
%149 = or i32 %.086100, 15
%150 = mul i32 %149, %6
%151 = add i32 %38, %150
%152 = sext i32 %151 to i64
%153 = getelementptr inbounds float, float* %19, i64 %152
%154 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %153, i32 4) #8
%155 = fadd float %148, %154
%156 = icmp slt i32 %40, 128
br i1 %156, label %39, label %.critedge.loopexit125
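; NOTE: .preheader.preheader above is the 16x-unrolled accumulation over one
; strip: sixteen __ldg loads at consecutive multiples of blockDim.x (%6) from
; the strip base %38, chained through fadd, entered only while the strip's
; last element (index %43) is still in bounds.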
; <label>:157: ; preds = %39
%.lcssa = phi i32 [ %45, %39 ]
%.09299.lcssa = phi float [ %.09299, %39 ]
%158 = add i32 %.lcssa, %36
%159 = icmp slt i32 %158, %2
br i1 %159, label %.lr.ph.preheader, label %.critedge
.lr.ph.preheader: ; preds = %157
br label %.lr.ph
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.084102 = phi i32 [ %165, %.lr.ph ], [ %158, %.lr.ph.preheader ]
%.1101 = phi float [ %164, %.lr.ph ], [ %.09299.lcssa, %.lr.ph.preheader ]
%160 = add nsw i32 %.084102, %37
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %19, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %.1101, %163
%165 = add i32 %.084102, %6
%166 = icmp slt i32 %165, %2
br i1 %166, label %.lr.ph, label %.critedge.loopexit
.critedge.loopexit: ; preds = %.lr.ph
%.lcssa134 = phi float [ %164, %.lr.ph ]
br label %.critedge
.critedge.loopexit125: ; preds = %.preheader.preheader
%.lcssa133 = phi float [ %155, %.preheader.preheader ]
br label %.critedge
.critedge: ; preds = %.critedge.loopexit125, %.critedge.loopexit, %157
%.3 = phi float [ %.09299.lcssa, %157 ], [ %.lcssa134, %.critedge.loopexit ], [ %.lcssa133, %.critedge.loopexit125 ]
tail call void @llvm.cuda.syncthreads()
br label %168
; <label>:167: ; preds = %168
%.lcssa135 = phi float [ %170, %168 ]
br i1 %21, label %173, label %177
; <label>:168: ; preds = %.critedge, %168
%.0104 = phi i32 [ 16, %.critedge ], [ %171, %168 ]
%.4103 = phi float [ %.3, %.critedge ], [ %170, %168 ]
%169 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.4103, i32 %.0104, i32 31) #3, !srcloc !53
%170 = fadd float %.4103, %169
%171 = lshr i32 %.0104, 1
%172 = icmp eq i32 %171, 0
br i1 %172, label %167, label %168
; <label>:173: ; preds = %167
%174 = sext i32 %31 to i64
%175 = getelementptr inbounds float, float* %4, i64 %174
%176 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %175, float %.lcssa135) #8
br label %177
; <label>:177: ; preds = %167, %173, %30
tail call void @llvm.cuda.syncthreads()
%178 = add i32 %.083105, %12
%179 = icmp slt i32 %178, %11
br i1 %179, label %30, label %._crit_edge.loopexit
}
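; The kernel below is OuterReductionKernel<16, ..., SumReducer<float>>,
; reducing along the outer (strided) dimension. With one block the output is
; zeroed first. Each thread grid-strides over (column, 16-row tile) pairs,
; serially sums up to 16 coefficients spaced %3 apart, and atomic-adds the
; tile sum into output[column]; no warp shuffle is needed here because each
; thread owns its tile outright.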
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%8 = mul nuw nsw i32 %7, %6
%9 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%10 = mul nuw nsw i32 %9, %6
%11 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%12 = add nuw nsw i32 %10, %11
%13 = icmp eq i32 %7, 1
br i1 %13, label %.preheader, label %19
.preheader: ; preds = %5
%14 = icmp slt i32 %12, %3
br i1 %14, label %.lr.ph60.preheader, label %._crit_edge61
.lr.ph60.preheader: ; preds = %.preheader
br label %.lr.ph60
._crit_edge61.loopexit: ; preds = %.lr.ph60
br label %._crit_edge61
._crit_edge61: ; preds = %._crit_edge61.loopexit, %.preheader
tail call void @llvm.cuda.syncthreads()
br label %19
.lr.ph60: ; preds = %.lr.ph60.preheader, %.lr.ph60
%.059 = phi i32 [ %17, %.lr.ph60 ], [ %12, %.lr.ph60.preheader ]
%15 = sext i32 %.059 to i64
%16 = getelementptr inbounds float, float* %4, i64 %15
store float 0.000000e+00, float* %16, align 4
%17 = add nsw i32 %.059, %8
%18 = icmp slt i32 %17, %3
br i1 %18, label %.lr.ph60, label %._crit_edge61.loopexit
; <label>:19: ; preds = %._crit_edge61, %5
%20 = add i32 %2, 15
%21 = sdiv i32 %20, 16
%22 = mul nsw i32 %21, %3
%23 = icmp slt i32 %12, %22
br i1 %23, label %.lr.ph57, label %._crit_edge58
.lr.ph57: ; preds = %19
%24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0
%25 = load float*, float** %24, align 8
br label %26
._crit_edge58.loopexit: ; preds = %._crit_edge
br label %._crit_edge58
._crit_edge58: ; preds = %._crit_edge58.loopexit, %19
ret void
; <label>:26: ; preds = %.lr.ph57, %._crit_edge
%.04755 = phi i32 [ %12, %.lr.ph57 ], [ %36, %._crit_edge ]
%27 = srem i32 %.04755, %3
%28 = sdiv i32 %.04755, %3
%29 = shl nsw i32 %28, 4
%30 = add nsw i32 %29, 16
%31 = icmp sgt i32 %30, %2
%..i = select i1 %31, i32 %2, i32 %30
%32 = icmp slt i32 %29, %..i
br i1 %32, label %.lr.ph.preheader, label %._crit_edge
.lr.ph.preheader: ; preds = %26
br label %.lr.ph
._crit_edge.loopexit: ; preds = %.lr.ph
%.lcssa = phi float [ %43, %.lr.ph ]
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %26
%.052.lcssa = phi float [ 0.000000e+00, %26 ], [ %.lcssa, %._crit_edge.loopexit ]
%33 = sext i32 %27 to i64
%34 = getelementptr inbounds float, float* %4, i64 %33
%35 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %34, float %.052.lcssa) #8
%36 = add nsw i32 %.04755, %8
%37 = icmp slt i32 %36, %22
br i1 %37, label %26, label %._crit_edge58.loopexit
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.04654 = phi i32 [ %44, %.lr.ph ], [ %29, %.lr.ph.preheader ]
%.05253 = phi float [ %43, %.lr.ph ], [ 0.000000e+00, %.lr.ph.preheader ]
%38 = mul nsw i32 %.04654, %3
%39 = add nsw i32 %38, %27
%40 = sext i32 %39 to i64
%41 = getelementptr inbounds float, float* %25, i64 %40
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
%43 = fadd float %.05253, %42
%44 = add nsw i32 %.04654, 1
%45 = icmp slt i32 %44, %..i
br i1 %45, label %.lr.ph, label %._crit_edge.loopexit
}
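; The final kernel is the generic elementwise EigenMetaKernel instantiated
; for TensorEvaluator<const TensorEvalToOp<const TensorReductionOp<
; SumReducer<float>, ...>>, GpuDevice>: a plain grid-stride loop over output
; coefficients into which the reduction's evalScalar has been inlined, so
; each thread sums its reduced slice with a 4x-unrolled (plus epilogue) __ldg
; loop and stores the result into the eval-to buffer — presumably the generic
; fallback path when none of the specialized kernels above is selected.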
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.14"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.444.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 7
%.sroa.444.0..sroa_cast = bitcast i32* %.sroa.444.0..sroa_idx to i64*
%.sroa.444.0.copyload = load i64, i64* %.sroa.444.0..sroa_cast, align 8
%.sroa.546.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 9, i32 0, i64 0
%.sroa.546.0.copyload = load i32, i32* %.sroa.546.0..sroa_idx, align 8
%.sroa.750.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 10, i32 0
%.sroa.750.0.copyload = load float*, float** %.sroa.750.0..sroa_idx, align 8
%.sroa.9.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 2
%.sroa.9.0.copyload = load float*, float** %.sroa.9.0..sroa_idx, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
.lr.ph.i: ; preds = %2
%11 = trunc i64 %.sroa.444.0.copyload to i32
%12 = icmp sgt i32 %.sroa.546.0.copyload, 0
%13 = lshr i64 %.sroa.444.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i
br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i
%15 = add i32 %.sroa.546.0.copyload, -1
%xtraiter = and i32 %.sroa.546.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.546.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i
br label %18
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa66 = phi i32 [ %47, %18 ]
%.lcssa65 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
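; annotation: remainder (epilogue) loop for the unroll above. It runs the
; leftover %xtraiter (= trip count mod 4) iterations one load/fadd at a
; time, starting from the accumulator and index carried out of the unrolled
; body via the .unr phis.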
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; <label>:48: ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !68
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa67 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit
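; annotation: the non-".us" version of the output loop. No input elements
; are accumulated on this path, so each strided output element is simply
; set to 0.0f, the identity of SumReducer<float>.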
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit, %2
ret void
}
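; annotation: row-reduction kernel instantiated for the sum reducer
; (template arguments <32, 256, 128> per the mangled name). The chain of
; blocks below validates the launch configuration -- blockDim must be
; 256x1x1 and gridDim 32x1x1 -- and routes any mismatch to __assert_fail
; with the message strings @.str through @.str.6.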
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
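; annotation: grid-stride work loop setup. %31 = ceil(%3 / 32768) is the
; number of 32768-wide chunks per row (%3 being the per-row element count,
; used as the row stride at %47), and %32 = %31 * %2 the total (row, chunk)
; work items. Blocks walk that space in steps of 32, the asserted gridDim.x;
; inside the loop at %41, srem/sdiv by %31 split the linear index back into
; a chunk and a row, and %45 = chunk*32768 + tid is the thread's base column.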
.lr.ph: ; preds = %29
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%38 = load float*, float** %37, align 8
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
br label %41
._crit_edge.loopexit: ; preds = %187
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:41: ; preds = %.lr.ph, %187
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ]
%42 = srem i32 %.0114, %31
%43 = sdiv i32 %.0114, %31
%44 = shl nsw i32 %42, 15
%45 = or i32 %44, %34
%46 = icmp slt i32 %43, %2
br i1 %46, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %164, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %41
%47 = mul nsw i32 %43, %3
%48 = add i32 %47, %45
br label %49
; <label>:49: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ]
%50 = add nuw nsw i32 %.098108, 16
%51 = shl i32 %.098108, 8
%52 = or i32 %51, 3840
%53 = add nsw i32 %52, %45
%54 = icmp slt i32 %53, %3
br i1 %54, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %49
%55 = add i32 %48, %51
%56 = sext i32 %55 to i64
%57 = getelementptr inbounds float, float* %40, i64 %56
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8
%59 = fadd float %.095109, %58
%60 = shl i32 %.098108, 8
%61 = or i32 %60, 256
%62 = add i32 %48, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, float* %40, i64 %63
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8
%66 = fadd float %59, %65
%67 = shl i32 %.098108, 8
%68 = or i32 %67, 512
%69 = add i32 %48, %68
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %40, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %66, %72
%74 = shl i32 %.098108, 8
%75 = or i32 %74, 768
%76 = add i32 %48, %75
%77 = sext i32 %76 to i64
%78 = getelementptr inbounds float, float* %40, i64 %77
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8
%80 = fadd float %73, %79
%81 = shl i32 %.098108, 8
%82 = or i32 %81, 1024
%83 = add i32 %48, %82
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %40, i64 %84
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8
%87 = fadd float %80, %86
%88 = shl i32 %.098108, 8
%89 = or i32 %88, 1280
%90 = add i32 %48, %89
%91 = sext i32 %90 to i64
%92 = getelementptr inbounds float, float* %40, i64 %91
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8
%94 = fadd float %87, %93
%95 = shl i32 %.098108, 8
%96 = or i32 %95, 1536
%97 = add i32 %48, %96
%98 = sext i32 %97 to i64
%99 = getelementptr inbounds float, float* %40, i64 %98
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8
%101 = fadd float %94, %100
%102 = shl i32 %.098108, 8
%103 = or i32 %102, 1792
%104 = add i32 %48, %103
%105 = sext i32 %104 to i64
%106 = getelementptr inbounds float, float* %40, i64 %105
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8
%108 = fadd float %101, %107
%109 = shl i32 %.098108, 8
%110 = or i32 %109, 2048
%111 = add i32 %48, %110
%112 = sext i32 %111 to i64
%113 = getelementptr inbounds float, float* %40, i64 %112
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8
%115 = fadd float %108, %114
%116 = shl i32 %.098108, 8
%117 = or i32 %116, 2304
%118 = add i32 %48, %117
%119 = sext i32 %118 to i64
%120 = getelementptr inbounds float, float* %40, i64 %119
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8
%122 = fadd float %115, %121
%123 = shl i32 %.098108, 8
%124 = or i32 %123, 2560
%125 = add i32 %48, %124
%126 = sext i32 %125 to i64
%127 = getelementptr inbounds float, float* %40, i64 %126
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8
%129 = fadd float %122, %128
%130 = shl i32 %.098108, 8
%131 = or i32 %130, 2816
%132 = add i32 %48, %131
%133 = sext i32 %132 to i64
%134 = getelementptr inbounds float, float* %40, i64 %133
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8
%136 = fadd float %129, %135
%137 = shl i32 %.098108, 8
%138 = or i32 %137, 3072
%139 = add i32 %48, %138
%140 = sext i32 %139 to i64
%141 = getelementptr inbounds float, float* %40, i64 %140
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8
%143 = fadd float %136, %142
%144 = shl i32 %.098108, 8
%145 = or i32 %144, 3328
%146 = add i32 %48, %145
%147 = sext i32 %146 to i64
%148 = getelementptr inbounds float, float* %40, i64 %147
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8
%150 = fadd float %143, %149
%151 = shl i32 %.098108, 8
%152 = or i32 %151, 3584
%153 = add i32 %48, %152
%154 = sext i32 %153 to i64
%155 = getelementptr inbounds float, float* %40, i64 %154
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8
%157 = fadd float %150, %156
%158 = shl i32 %.098108, 8
%159 = or i32 %158, 3840
%160 = add i32 %48, %159
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %40, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %157, %163
%165 = icmp slt i32 %50, 128
br i1 %165, label %49, label %.thread.preheader.loopexit
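; annotation: the loop at %49 above is the fast path for fully in-bounds
; tiles. %.098108 advances 0, 16, ..., 112, and each trip is completely
; unrolled into sixteen ldg/fadd pairs at column offsets
; (%.098108 << 8) | k*256 for k = 0..15. .preheader101 and the chain of
; blocks from %168 onward form the guarded tail: the same sixteen strided
; loads, but each preceded by an icmp against the row length %3, bailing to
; .thread.preheader as soon as a column falls out of range.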
.preheader101: ; preds = %49
%.lcssa = phi i32 [ %51, %49 ]
%.098108.lcssa = phi i32 [ %.098108, %49 ]
%.095109.lcssa = phi float [ %.095109, %49 ]
%166 = add nsw i32 %.lcssa, %45
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %47
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %40, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = fadd float %.095109.lcssa, %172
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %45
%177 = icmp slt i32 %176, %3
br i1 %177, label %190, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %46, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %187, label %183
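; annotation: warp-level reduction. The .thread loop below applies
; shfl.down.b32 with offsets 16, 8, 4, 2, 1 (%.092113 is halved each trip)
; and accumulates with fadd, so after five trips lane 0 of each warp holds
; the warp's sum. Block %178 above then lets only lane 0 of a valid row
; proceed (%.not is "lane != 0", %.not99 is "row out of range") to %183,
; which publishes the result with a float atomic add.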
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = fadd float %.8112, %179
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !69
; <label>:183: ; preds = %178
%184 = sext i32 %43 to i64
%185 = getelementptr inbounds float, float* %38, i64 %184
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8
br label %187
; <label>:187: ; preds = %178, %183
%188 = add nuw nsw i32 %.0114, 32
%189 = icmp slt i32 %188, %32
br i1 %189, label %41, label %._crit_edge.loopexit
; <label>:190: ; preds = %168
%191 = add nsw i32 %176, %47
%192 = sext i32 %191 to i64
%193 = getelementptr inbounds float, float* %40, i64 %192
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8
%195 = fadd float %173, %194
%196 = shl i32 %.098108.lcssa, 8
%197 = or i32 %196, 512
%198 = add nsw i32 %197, %45
%199 = icmp slt i32 %198, %3
br i1 %199, label %200, label %.thread.preheader
; <label>:200: ; preds = %190
%201 = add nsw i32 %198, %47
%202 = sext i32 %201 to i64
%203 = getelementptr inbounds float, float* %40, i64 %202
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8
%205 = fadd float %195, %204
%206 = shl i32 %.098108.lcssa, 8
%207 = or i32 %206, 768
%208 = add nsw i32 %207, %45
%209 = icmp slt i32 %208, %3
br i1 %209, label %210, label %.thread.preheader
; <label>:210: ; preds = %200
%211 = add nsw i32 %208, %47
%212 = sext i32 %211 to i64
%213 = getelementptr inbounds float, float* %40, i64 %212
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8
%215 = fadd float %205, %214
%216 = shl i32 %.098108.lcssa, 8
%217 = or i32 %216, 1024
%218 = add nsw i32 %217, %45
%219 = icmp slt i32 %218, %3
br i1 %219, label %220, label %.thread.preheader
; <label>:220: ; preds = %210
%221 = add nsw i32 %218, %47
%222 = sext i32 %221 to i64
%223 = getelementptr inbounds float, float* %40, i64 %222
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8
%225 = fadd float %215, %224
%226 = shl i32 %.098108.lcssa, 8
%227 = or i32 %226, 1280
%228 = add nsw i32 %227, %45
%229 = icmp slt i32 %228, %3
br i1 %229, label %230, label %.thread.preheader
; <label>:230: ; preds = %220
%231 = add nsw i32 %228, %47
%232 = sext i32 %231 to i64
%233 = getelementptr inbounds float, float* %40, i64 %232
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8
%235 = fadd float %225, %234
%236 = shl i32 %.098108.lcssa, 8
%237 = or i32 %236, 1536
%238 = add nsw i32 %237, %45
%239 = icmp slt i32 %238, %3
br i1 %239, label %240, label %.thread.preheader
; <label>:240: ; preds = %230
%241 = add nsw i32 %238, %47
%242 = sext i32 %241 to i64
%243 = getelementptr inbounds float, float* %40, i64 %242
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8
%245 = fadd float %235, %244
%246 = shl i32 %.098108.lcssa, 8
%247 = or i32 %246, 1792
%248 = add nsw i32 %247, %45
%249 = icmp slt i32 %248, %3
br i1 %249, label %250, label %.thread.preheader
; <label>:250: ; preds = %240
%251 = add nsw i32 %248, %47
%252 = sext i32 %251 to i64
%253 = getelementptr inbounds float, float* %40, i64 %252
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8
%255 = fadd float %245, %254
%256 = shl i32 %.098108.lcssa, 8
%257 = or i32 %256, 2048
%258 = add nsw i32 %257, %45
%259 = icmp slt i32 %258, %3
br i1 %259, label %260, label %.thread.preheader
; <label>:260: ; preds = %250
%261 = add nsw i32 %258, %47
%262 = sext i32 %261 to i64
%263 = getelementptr inbounds float, float* %40, i64 %262
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8
%265 = fadd float %255, %264
%266 = shl i32 %.098108.lcssa, 8
%267 = or i32 %266, 2304
%268 = add nsw i32 %267, %45
%269 = icmp slt i32 %268, %3
br i1 %269, label %270, label %.thread.preheader
; <label>:270: ; preds = %260
%271 = add nsw i32 %268, %47
%272 = sext i32 %271 to i64
%273 = getelementptr inbounds float, float* %40, i64 %272
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8
%275 = fadd float %265, %274
%276 = shl i32 %.098108.lcssa, 8
%277 = or i32 %276, 2560
%278 = add nsw i32 %277, %45
%279 = icmp slt i32 %278, %3
br i1 %279, label %280, label %.thread.preheader
; <label>:280: ; preds = %270
%281 = add nsw i32 %278, %47
%282 = sext i32 %281 to i64
%283 = getelementptr inbounds float, float* %40, i64 %282
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8
%285 = fadd float %275, %284
%286 = shl i32 %.098108.lcssa, 8
%287 = or i32 %286, 2816
%288 = add nsw i32 %287, %45
%289 = icmp slt i32 %288, %3
br i1 %289, label %290, label %.thread.preheader
; <label>:290: ; preds = %280
%291 = add nsw i32 %288, %47
%292 = sext i32 %291 to i64
%293 = getelementptr inbounds float, float* %40, i64 %292
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8
%295 = fadd float %285, %294
%296 = shl i32 %.098108.lcssa, 8
%297 = or i32 %296, 3072
%298 = add nsw i32 %297, %45
%299 = icmp slt i32 %298, %3
br i1 %299, label %300, label %.thread.preheader
; <label>:300: ; preds = %290
%301 = add nsw i32 %298, %47
%302 = sext i32 %301 to i64
%303 = getelementptr inbounds float, float* %40, i64 %302
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8
%305 = fadd float %295, %304
%306 = shl i32 %.098108.lcssa, 8
%307 = or i32 %306, 3328
%308 = add nsw i32 %307, %45
%309 = icmp slt i32 %308, %3
br i1 %309, label %310, label %.thread.preheader
; <label>:310: ; preds = %300
%311 = add nsw i32 %308, %47
%312 = sext i32 %311 to i64
%313 = getelementptr inbounds float, float* %40, i64 %312
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8
%315 = fadd float %305, %314
%316 = shl i32 %.098108.lcssa, 8
%317 = or i32 %316, 3584
%318 = add nsw i32 %317, %45
%319 = icmp slt i32 %318, %3
br i1 %319, label %320, label %.thread.preheader
; <label>:320: ; preds = %310
%321 = add nsw i32 %318, %47
%322 = sext i32 %321 to i64
%323 = getelementptr inbounds float, float* %40, i64 %322
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8
%325 = fadd float %315, %324
br label %.thread.preheader
}
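; annotation: the same RowReduceKernel skeleton specialized for
; CudaMaxReducer. The accumulator is seeded from the float stored in the
; reducer (%.idx.val) instead of 0.0, every fadd becomes a call to
; llvm.nvvm.fmax.f, and the final cross-warp combine is a compare-and-swap
; loop (CudaMaxReducer::atomic_reduce) rather than an atomic float add.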
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
br label %39
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ]
%40 = srem i32 %.0114, %31
%41 = sdiv i32 %.0114, %31
%42 = shl nsw i32 %40, 15
%43 = or i32 %42, %34
%.idx.val = load float, float* %.idx, align 4
%44 = icmp slt i32 %41, %2
br i1 %44, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %163, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %39
%45 = mul nsw i32 %41, %3
%46 = add i32 %45, %43
%47 = load float*, float** %38, align 8
br label %48
; <label>:48: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ]
%49 = add nuw nsw i32 %.098108, 16
%50 = shl i32 %.098108, 8
%51 = or i32 %50, 3840
%52 = add nsw i32 %51, %43
%53 = icmp slt i32 %52, %3
br i1 %53, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %48
%54 = add i32 %46, %50
%55 = sext i32 %54 to i64
%56 = getelementptr inbounds float, float* %47, i64 %55
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8
%59 = shl i32 %.098108, 8
%60 = or i32 %59, 256
%61 = add i32 %46, %60
%62 = sext i32 %61 to i64
%63 = getelementptr inbounds float, float* %47, i64 %62
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8
%66 = shl i32 %.098108, 8
%67 = or i32 %66, 512
%68 = add i32 %46, %67
%69 = sext i32 %68 to i64
%70 = getelementptr inbounds float, float* %47, i64 %69
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8
%73 = shl i32 %.098108, 8
%74 = or i32 %73, 768
%75 = add i32 %46, %74
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %47, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8
%80 = shl i32 %.098108, 8
%81 = or i32 %80, 1024
%82 = add i32 %46, %81
%83 = sext i32 %82 to i64
%84 = getelementptr inbounds float, float* %47, i64 %83
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8
%87 = shl i32 %.098108, 8
%88 = or i32 %87, 1280
%89 = add i32 %46, %88
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds float, float* %47, i64 %90
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8
%94 = shl i32 %.098108, 8
%95 = or i32 %94, 1536
%96 = add i32 %46, %95
%97 = sext i32 %96 to i64
%98 = getelementptr inbounds float, float* %47, i64 %97
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8
%101 = shl i32 %.098108, 8
%102 = or i32 %101, 1792
%103 = add i32 %46, %102
%104 = sext i32 %103 to i64
%105 = getelementptr inbounds float, float* %47, i64 %104
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8
%108 = shl i32 %.098108, 8
%109 = or i32 %108, 2048
%110 = add i32 %46, %109
%111 = sext i32 %110 to i64
%112 = getelementptr inbounds float, float* %47, i64 %111
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8
%115 = shl i32 %.098108, 8
%116 = or i32 %115, 2304
%117 = add i32 %46, %116
%118 = sext i32 %117 to i64
%119 = getelementptr inbounds float, float* %47, i64 %118
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8
%122 = shl i32 %.098108, 8
%123 = or i32 %122, 2560
%124 = add i32 %46, %123
%125 = sext i32 %124 to i64
%126 = getelementptr inbounds float, float* %47, i64 %125
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8
%129 = shl i32 %.098108, 8
%130 = or i32 %129, 2816
%131 = add i32 %46, %130
%132 = sext i32 %131 to i64
%133 = getelementptr inbounds float, float* %47, i64 %132
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8
%136 = shl i32 %.098108, 8
%137 = or i32 %136, 3072
%138 = add i32 %46, %137
%139 = sext i32 %138 to i64
%140 = getelementptr inbounds float, float* %47, i64 %139
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8
%143 = shl i32 %.098108, 8
%144 = or i32 %143, 3328
%145 = add i32 %46, %144
%146 = sext i32 %145 to i64
%147 = getelementptr inbounds float, float* %47, i64 %146
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8
%150 = shl i32 %.098108, 8
%151 = or i32 %150, 3584
%152 = add i32 %46, %151
%153 = sext i32 %152 to i64
%154 = getelementptr inbounds float, float* %47, i64 %153
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8
%157 = shl i32 %.098108, 8
%158 = or i32 %157, 3840
%159 = add i32 %46, %158
%160 = sext i32 %159 to i64
%161 = getelementptr inbounds float, float* %47, i64 %160
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8
%164 = icmp slt i32 %49, 128
br i1 %164, label %48, label %.thread.preheader.loopexit
.preheader101: ; preds = %48
%.lcssa = phi i32 [ %50, %48 ]
%.098108.lcssa = phi i32 [ %.098108, %48 ]
%.095109.lcssa = phi float [ %.095109, %48 ]
%165 = load float*, float** %38, align 8
%166 = add nsw i32 %.lcssa, %43
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %45
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %165, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %43
%177 = icmp slt i32 %176, %3
br i1 %177, label %198, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %44, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !70
; <label>:183: ; preds = %178
%184 = load float*, float** %37, align 8
%185 = sext i32 %41 to i64
%186 = getelementptr inbounds float, float* %184, i64 %185
%187 = bitcast float %.lcssa138 to i32
%188 = bitcast float* %186 to i32*
%189 = load i32, i32* %188, align 4
br label %190
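; annotation: atomic max emulated with cmpxchg. %187 is the warp result
; reinterpreted as i32 bits and %189 the current output bits; the loop
; below retries the compare-and-swap until either the stored value is no
; longer smaller than the warp result (%192 fails) or the exchange succeeds
; (%not..i observes the expected value).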
; <label>:190: ; preds = %193, %183
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ]
%191 = bitcast i32 %.011.i to float
%192 = fcmp olt float %191, %.lcssa138
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit
; <label>:193: ; preds = %190
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst
%195 = extractvalue { i32, i1 } %194, 0
%not..i = icmp eq i32 %.011.i, %195
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178
%196 = add nuw nsw i32 %.0114, 32
%197 = icmp slt i32 %196, %32
br i1 %197, label %39, label %._crit_edge.loopexit
; <label>:198: ; preds = %168
%199 = add nsw i32 %176, %45
%200 = sext i32 %199 to i64
%201 = getelementptr inbounds float, float* %165, i64 %200
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8
%204 = shl i32 %.098108.lcssa, 8
%205 = or i32 %204, 512
%206 = add nsw i32 %205, %43
%207 = icmp slt i32 %206, %3
br i1 %207, label %208, label %.thread.preheader
; <label>:208: ; preds = %198
%209 = add nsw i32 %206, %45
%210 = sext i32 %209 to i64
%211 = getelementptr inbounds float, float* %165, i64 %210
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8
%214 = shl i32 %.098108.lcssa, 8
%215 = or i32 %214, 768
%216 = add nsw i32 %215, %43
%217 = icmp slt i32 %216, %3
br i1 %217, label %218, label %.thread.preheader
; <label>:218: ; preds = %208
%219 = add nsw i32 %216, %45
%220 = sext i32 %219 to i64
%221 = getelementptr inbounds float, float* %165, i64 %220
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8
%224 = shl i32 %.098108.lcssa, 8
%225 = or i32 %224, 1024
%226 = add nsw i32 %225, %43
%227 = icmp slt i32 %226, %3
br i1 %227, label %228, label %.thread.preheader
; <label>:228: ; preds = %218
%229 = add nsw i32 %226, %45
%230 = sext i32 %229 to i64
%231 = getelementptr inbounds float, float* %165, i64 %230
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8
%234 = shl i32 %.098108.lcssa, 8
%235 = or i32 %234, 1280
%236 = add nsw i32 %235, %43
%237 = icmp slt i32 %236, %3
br i1 %237, label %238, label %.thread.preheader
; <label>:238: ; preds = %228
%239 = add nsw i32 %236, %45
%240 = sext i32 %239 to i64
%241 = getelementptr inbounds float, float* %165, i64 %240
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8
%244 = shl i32 %.098108.lcssa, 8
%245 = or i32 %244, 1536
%246 = add nsw i32 %245, %43
%247 = icmp slt i32 %246, %3
br i1 %247, label %248, label %.thread.preheader
; <label>:248: ; preds = %238
%249 = add nsw i32 %246, %45
%250 = sext i32 %249 to i64
%251 = getelementptr inbounds float, float* %165, i64 %250
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8
%254 = shl i32 %.098108.lcssa, 8
%255 = or i32 %254, 1792
%256 = add nsw i32 %255, %43
%257 = icmp slt i32 %256, %3
br i1 %257, label %258, label %.thread.preheader
; <label>:258: ; preds = %248
%259 = add nsw i32 %256, %45
%260 = sext i32 %259 to i64
%261 = getelementptr inbounds float, float* %165, i64 %260
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8
%264 = shl i32 %.098108.lcssa, 8
%265 = or i32 %264, 2048
%266 = add nsw i32 %265, %43
%267 = icmp slt i32 %266, %3
br i1 %267, label %268, label %.thread.preheader
; <label>:268: ; preds = %258
%269 = add nsw i32 %266, %45
%270 = sext i32 %269 to i64
%271 = getelementptr inbounds float, float* %165, i64 %270
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8
%274 = shl i32 %.098108.lcssa, 8
%275 = or i32 %274, 2304
%276 = add nsw i32 %275, %43
%277 = icmp slt i32 %276, %3
br i1 %277, label %278, label %.thread.preheader
; <label>:278: ; preds = %268
%279 = add nsw i32 %276, %45
%280 = sext i32 %279 to i64
%281 = getelementptr inbounds float, float* %165, i64 %280
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8
%284 = shl i32 %.098108.lcssa, 8
%285 = or i32 %284, 2560
%286 = add nsw i32 %285, %43
%287 = icmp slt i32 %286, %3
br i1 %287, label %288, label %.thread.preheader
; <label>:288: ; preds = %278
%289 = add nsw i32 %286, %45
%290 = sext i32 %289 to i64
%291 = getelementptr inbounds float, float* %165, i64 %290
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8
%294 = shl i32 %.098108.lcssa, 8
%295 = or i32 %294, 2816
%296 = add nsw i32 %295, %43
%297 = icmp slt i32 %296, %3
br i1 %297, label %298, label %.thread.preheader
; <label>:298: ; preds = %288
%299 = add nsw i32 %296, %45
%300 = sext i32 %299 to i64
%301 = getelementptr inbounds float, float* %165, i64 %300
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8
%304 = shl i32 %.098108.lcssa, 8
%305 = or i32 %304, 3072
%306 = add nsw i32 %305, %43
%307 = icmp slt i32 %306, %3
br i1 %307, label %308, label %.thread.preheader
; <label>:308: ; preds = %298
%309 = add nsw i32 %306, %45
%310 = sext i32 %309 to i64
%311 = getelementptr inbounds float, float* %165, i64 %310
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8
%314 = shl i32 %.098108.lcssa, 8
%315 = or i32 %314, 3328
%316 = add nsw i32 %315, %43
%317 = icmp slt i32 %316, %3
br i1 %317, label %318, label %.thread.preheader
; <label>:318: ; preds = %308
%319 = add nsw i32 %316, %45
%320 = sext i32 %319 to i64
%321 = getelementptr inbounds float, float* %165, i64 %320
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8
%324 = shl i32 %.098108.lcssa, 8
%325 = or i32 %324, 3584
%326 = add nsw i32 %325, %43
%327 = icmp slt i32 %326, %3
br i1 %327, label %328, label %.thread.preheader
; <label>:328: ; preds = %318
%329 = add nsw i32 %326, %45
%330 = sext i32 %329 to i64
%331 = getelementptr inbounds float, float* %165, i64 %330
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8
br label %.thread.preheader
}
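; annotation: column-reduction kernel for the sum reducer (template
; arguments <128, 256, 16>); here gridDim.x is asserted to be 128. Each
; thread privately sums up to 16 elements of one column (the loop at %53
; below, unrolled by 2, with per-element bounds checks against the row
; count %2), then folds its partial sum into the output with a single float
; atomic add at %70 -- no warp shuffle is needed because different threads
; own different columns.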
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
%41 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%42 = load float*, float** %41, align 8
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
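; annotation: %38 tests %3 > -1, i.e. a non-negative column count. The
; ".us" path below is the real reduction; the .lr.ph.split path is the
; degenerate complement, which the optimizer has reduced to atomic adds of
; +0.0 on the strided outputs (presumably because no element can be in
; bounds when %3 is negative).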
.lr.ph.split.preheader: ; preds = %.lr.ph
%43 = add i32 %32, -1
%44 = sub i32 %43, %34
%45 = sub i32 %44, %35
%46 = lshr i32 %45, 15
%47 = add nuw nsw i32 %46, 1
%xtraiter = and i32 %47, 3
%48 = icmp ult i32 %45, 98304
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader
%unroll_iter = sub nsw i32 %47, %xtraiter
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ]
%49 = srem i32 %.047.us, %3
%50 = sdiv i32 %.047.us, %3
%51 = srem i32 %50, %31
%52 = shl nsw i32 %51, 4
br label %53
; <label>:53: ; preds = %104, %.lr.ph.split.us
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ]
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ]
%54 = add nuw nsw i32 %.04346.us.us, %52
%55 = icmp slt i32 %54, %2
br i1 %55, label %56, label %62
; <label>:56: ; preds = %53
%57 = mul nsw i32 %54, %3
%58 = add nsw i32 %57, %49
%59 = sext i32 %58 to i64
%60 = getelementptr inbounds float, float* %40, i64 %59
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8
br label %62
; <label>:62: ; preds = %56, %53
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ]
%64 = fadd float %.04445.us.us, %63
%65 = or i32 %.04346.us.us, 1
%66 = add nuw nsw i32 %65, %52
%67 = icmp slt i32 %66, %2
br i1 %67, label %98, label %104
.us-lcssa.us.us: ; preds = %104
%.lcssa = phi float [ %106, %104 ]
%68 = sext i32 %49 to i64
%69 = getelementptr inbounds float, float* %42, i64 %68
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8
%71 = add nuw nsw i32 %.047.us, 32768
%72 = icmp slt i32 %71, %32
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us
br label %._crit_edge
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ]
br label %._crit_edge.loopexit59.unr-lcssa
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa
br label %.lr.ph.split.epil
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ]
%73 = srem i32 %.047.epil, %3
%74 = sext i32 %73 to i64
%75 = getelementptr inbounds float, float* %42, i64 %74
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8
%77 = add nuw nsw i32 %.047.epil, 32768
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !71
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil
br label %._crit_edge.loopexit59
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29
ret void
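; annotation: the degenerate loop itself, unrolled by 4. Each trip performs
; four atomicAdd(out[srem(idx, %3)], 0.0f) calls while the linear index
; steps by 32768, with the epilogue blocks above soaking up the remaining
; trip count mod 4.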
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ]
%78 = srem i32 %.047, %3
%79 = sext i32 %78 to i64
%80 = getelementptr inbounds float, float* %42, i64 %79
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8
%82 = add nuw nsw i32 %.047, 32768
%83 = srem i32 %82, %3
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %42, i64 %84
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8
%87 = add nsw i32 %.047, 65536
%88 = srem i32 %87, %3
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %42, i64 %89
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8
%92 = add nsw i32 %.047, 98304
%93 = srem i32 %92, %3
%94 = sext i32 %93 to i64
%95 = getelementptr inbounds float, float* %42, i64 %94
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8
%97 = add nsw i32 %.047, 131072
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split
; <label>:98: ; preds = %62
%99 = mul nsw i32 %66, %3
%100 = add nsw i32 %99, %49
%101 = sext i32 %100 to i64
%102 = getelementptr inbounds float, float* %40, i64 %101
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8
br label %104
; <label>:104: ; preds = %98, %62
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ]
%106 = fadd float %64, %105
%107 = add nsw i32 %.04346.us.us, 2
%exitcond.1 = icmp eq i32 %107, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53
}
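; annotation: max-reducer instantiation of the same
; ColumnReduceKernel<128, 256, 16> shape. The preamble below repeats the
; launch-configuration checks and takes the address of the float stored in
; the CudaMaxReducer (%.idx45), which seeds the accumulator in place of the
; sum variant's 0.0f.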
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ]
%41 = srem i32 %.048.us, %3
%42 = sdiv i32 %.048.us, %3
%43 = srem i32 %42, %31
%44 = shl nsw i32 %43, 4
%.idx45.val.us = load float, float* %.idx45, align 4
%45 = load float*, float** %39, align 8
br label %54
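; Blocks 46/49 are the inlined CudaMaxReducer::atomic_reduce: the candidate
; max is committed with a 32-bit cmpxchg retry loop that stops as soon as the
; stored value is no longer smaller than the candidate or the CAS succeeds.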
; <label>:46: ; preds = %49, %.us-lcssa.us.us
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ]
%47 = bitcast i32 %.011.i.us to float
%48 = fcmp olt float %47, %.lcssa
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
; <label>:49: ; preds = %46
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst
%51 = extractvalue { i32, i1 } %50, 0
%not..i.us = icmp eq i32 %.011.i.us, %51
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46
%52 = add nuw nsw i32 %.048.us, 32768
%53 = icmp slt i32 %52, %32
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit
; <label>:54: ; preds = %112, %.lr.ph.split.us
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ]
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ]
%55 = add nuw nsw i32 %.04347.us.us, %44
%56 = icmp slt i32 %55, %2
br i1 %56, label %57, label %63
; <label>:57: ; preds = %54
%58 = mul nsw i32 %55, %3
%59 = add nsw i32 %58, %41
%60 = sext i32 %59 to i64
%61 = getelementptr inbounds float, float* %45, i64 %60
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8
br label %63
; <label>:63: ; preds = %54, %57
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ]
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8
%66 = or i32 %.04347.us.us, 1
%67 = add nuw nsw i32 %66, %44
%68 = icmp slt i32 %67, %2
br i1 %68, label %106, label %112
.us-lcssa.us.us: ; preds = %112
%.lcssa = phi float [ %114, %112 ]
%69 = load float*, float** %40, align 8
%70 = sext i32 %41 to i64
%71 = getelementptr inbounds float, float* %69, i64 %70
%72 = bitcast float %.lcssa to i32
%73 = bitcast float* %71 to i32*
%74 = load i32, i32* %73, align 4
br label %46
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
br label %._crit_edge
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29
ret void
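; .lr.ph.split is the path where the column count %3 is not known positive.
; The 16-iteration row loop was fully unrolled and its guarded loads folded
; away, so the fmax chain below merely re-reduces the reducer's initial value
; %.idx45.val before committing it through the same cmpxchg-based atomic max.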
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ]
%.idx45.val = load float, float* %.idx45, align 4
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8
%91 = srem i32 %.048, %3
%92 = load float*, float** %40, align 8
%93 = sext i32 %91 to i64
%94 = getelementptr inbounds float, float* %92, i64 %93
%95 = bitcast float %90 to i32
%96 = bitcast float* %94 to i32*
%97 = load i32, i32* %96, align 4
br label %98
; <label>:98: ; preds = %101, %.lr.ph.split
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ]
%99 = bitcast i32 %.011.i to float
%100 = fcmp olt float %99, %90
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
; <label>:101: ; preds = %98
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst
%103 = extractvalue { i32, i1 } %102, 0
%not..i = icmp eq i32 %.011.i, %103
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101
%104 = add nuw nsw i32 %.048, 32768
%105 = icmp slt i32 %104, %32
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60
; <label>:106: ; preds = %63
%107 = mul nsw i32 %67, %3
%108 = add nsw i32 %107, %41
%109 = sext i32 %108 to i64
%110 = getelementptr inbounds float, float* %45, i64 %109
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8
br label %112
; <label>:112: ; preds = %106, %63
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ]
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8
%115 = add nsw i32 %.04347.us.us, 2
%exitcond.1 = icmp eq i32 %115, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54
}
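; EigenMetaKernel assigning a TensorForcedEvalOp of the sum reduction into a
; 1-D TensorMap (demangled from the symbol below): the reduction has already
; been forced into a buffer, so the body is a plain grid-stride loop copying
; one 32-bit word per element from the evaluated buffer to the destination.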
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.15"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.021.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %0, i64 0, i32 0, i32 0
%.sroa.021.0.copyload = load float*, float** %.sroa.021.0..sroa_idx, align 8
%.sroa.5.0..sroa_idx25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %0, i64 0, i32 1, i32 3
%.sroa.5.0.copyload = load float*, float** %.sroa.5.0..sroa_idx25, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i.preheader, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
.lr.ph.i.preheader: ; preds = %2
br label %.lr.ph.i
.lr.ph.i: ; preds = %.lr.ph.i.preheader, %.lr.ph.i
%.07.i = phi i32 [ %17, %.lr.ph.i ], [ %7, %.lr.ph.i.preheader ]
%11 = sext i32 %.07.i to i64
%12 = getelementptr inbounds float, float* %.sroa.5.0.copyload, i64 %11
%13 = bitcast float* %12 to i32*
%14 = load i32, i32* %13, align 4
%15 = getelementptr inbounds float, float* %.sroa.021.0.copyload, i64 %11
%16 = bitcast float* %15 to i32*
store i32 %14, i32* %16, align 4
%17 = add nsw i32 %.07.i, %9
%18 = icmp slt i32 %17, %1
br i1 %18, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit: ; preds = %.lr.ph.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit, %2
ret void
}
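; EigenMetaKernel assigning a TensorReductionOp<SumReducer> over one dimension
; of a 2-D float tensor into a 1-D TensorMap: each output element sums one
; line of the input. The reduced extent is read from the evaluator struct
; (%.sroa.648.0.copyload); the inner sum is unrolled 4x with a scalar
; epilogue, and a separate path zero-fills the output when that extent is <= 0.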
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.24"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.041.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 0, i32 0
%.sroa.041.0.copyload = load float*, float** %.sroa.041.0..sroa_idx, align 8
%.sroa.545.0..sroa_idx46 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 7
%.sroa.545.0..sroa_cast = bitcast i32* %.sroa.545.0..sroa_idx46 to i64*
%.sroa.545.0.copyload = load i64, i64* %.sroa.545.0..sroa_cast, align 8
%.sroa.648.0..sroa_idx49 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 9, i32 0, i64 0
%.sroa.648.0.copyload = load i32, i32* %.sroa.648.0..sroa_idx49, align 8
%.sroa.8.0..sroa_idx53 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 10, i32 0
%.sroa.8.0.copyload = load float*, float** %.sroa.8.0..sroa_idx53, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
.lr.ph.i: ; preds = %2
%11 = trunc i64 %.sroa.545.0.copyload to i32
%12 = icmp sgt i32 %.sroa.648.0.copyload, 0
%13 = lshr i64 %.sroa.545.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i
br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i
%15 = add i32 %.sroa.648.0.copyload, -1
%xtraiter = and i32 %.sroa.648.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.648.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i
br label %18
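; 4x-unrolled reduction body: steps k, k|1, k|2, k|3 each load
; input[step * stride + base] through ldg (read-only data cache) and
; accumulate with fadd; %niter counts the remaining unrolled trips.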
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa67 = phi i32 [ %47, %18 ]
%.lcssa66 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; <label>:48: ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !72
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa68 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa68, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit, %2
ret void
}
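; InitVector kernel (demangled from the symbol below): a grid-stride loop that
; stores the scalar argument %0 into every element of a 1-D float tensor,
; presumably seeding the reduction output before the atomic kernels run.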
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = add nuw nsw i32 %6, %7
%9 = icmp slt i32 %8, %1
br i1 %9, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %3
%10 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %2, i64 0, i32 0
%11 = load float*, float** %10, align 8
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %5
br label %14
._crit_edge.loopexit: ; preds = %14
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %3
ret void
; <label>:14: ; preds = %.lr.ph, %14
%.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ]
%15 = sext i32 %.08 to i64
%16 = getelementptr inbounds float, float* %11, i64 %15
store float %0, float* %16, align 4
%17 = add i32 %13, %.08
%18 = icmp slt i32 %17, %1
br i1 %18, label %14, label %._crit_edge.loopexit
}
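; RowReduceKernel<32, 256, 128> with CudaSumReducer (demangled from the symbol
; below): each row is processed in 32768-element chunks (128 elements per
; thread x 256 threads). A thread accumulates its strided slice with ldg +
; fadd, the 32 lanes of each warp are then folded with shfl.down, and lane 0
; adds the warp total into the output element with a single atomicAdd.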
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0
%38 = load float*, float** %37, align 8
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
br label %41
._crit_edge.loopexit: ; preds = %187
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:41: ; preds = %.lr.ph, %187
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ]
%42 = srem i32 %.0114, %31
%43 = sdiv i32 %.0114, %31
%44 = shl nsw i32 %42, 15
%45 = or i32 %44, %34
%46 = icmp slt i32 %43, %2
br i1 %46, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %164, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %41
%47 = mul nsw i32 %43, %3
%48 = add i32 %47, %45
br label %49
; <label>:49: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ]
%50 = add nuw nsw i32 %.098108, 16
%51 = shl i32 %.098108, 8
%52 = or i32 %51, 3840
%53 = add nsw i32 %52, %45
%54 = icmp slt i32 %53, %3
br i1 %54, label %.preheader.preheader, label %.preheader101
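; Fast path: the whole 4096-element sub-chunk (16 steps x 256 threads) is in
; range, so the inner loop is fully unrolled; thread t loads columns base+t,
; base+t+256, ..., base+t+3840 through ldg and folds them into the running sum.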
.preheader.preheader: ; preds = %49
%55 = add i32 %48, %51
%56 = sext i32 %55 to i64
%57 = getelementptr inbounds float, float* %40, i64 %56
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8
%59 = fadd float %.095109, %58
%60 = shl i32 %.098108, 8
%61 = or i32 %60, 256
%62 = add i32 %48, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, float* %40, i64 %63
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8
%66 = fadd float %59, %65
%67 = shl i32 %.098108, 8
%68 = or i32 %67, 512
%69 = add i32 %48, %68
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %40, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %66, %72
%74 = shl i32 %.098108, 8
%75 = or i32 %74, 768
%76 = add i32 %48, %75
%77 = sext i32 %76 to i64
%78 = getelementptr inbounds float, float* %40, i64 %77
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8
%80 = fadd float %73, %79
%81 = shl i32 %.098108, 8
%82 = or i32 %81, 1024
%83 = add i32 %48, %82
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %40, i64 %84
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8
%87 = fadd float %80, %86
%88 = shl i32 %.098108, 8
%89 = or i32 %88, 1280
%90 = add i32 %48, %89
%91 = sext i32 %90 to i64
%92 = getelementptr inbounds float, float* %40, i64 %91
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8
%94 = fadd float %87, %93
%95 = shl i32 %.098108, 8
%96 = or i32 %95, 1536
%97 = add i32 %48, %96
%98 = sext i32 %97 to i64
%99 = getelementptr inbounds float, float* %40, i64 %98
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8
%101 = fadd float %94, %100
%102 = shl i32 %.098108, 8
%103 = or i32 %102, 1792
%104 = add i32 %48, %103
%105 = sext i32 %104 to i64
%106 = getelementptr inbounds float, float* %40, i64 %105
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8
%108 = fadd float %101, %107
%109 = shl i32 %.098108, 8
%110 = or i32 %109, 2048
%111 = add i32 %48, %110
%112 = sext i32 %111 to i64
%113 = getelementptr inbounds float, float* %40, i64 %112
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8
%115 = fadd float %108, %114
%116 = shl i32 %.098108, 8
%117 = or i32 %116, 2304
%118 = add i32 %48, %117
%119 = sext i32 %118 to i64
%120 = getelementptr inbounds float, float* %40, i64 %119
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8
%122 = fadd float %115, %121
%123 = shl i32 %.098108, 8
%124 = or i32 %123, 2560
%125 = add i32 %48, %124
%126 = sext i32 %125 to i64
%127 = getelementptr inbounds float, float* %40, i64 %126
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8
%129 = fadd float %122, %128
%130 = shl i32 %.098108, 8
%131 = or i32 %130, 2816
%132 = add i32 %48, %131
%133 = sext i32 %132 to i64
%134 = getelementptr inbounds float, float* %40, i64 %133
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8
%136 = fadd float %129, %135
%137 = shl i32 %.098108, 8
%138 = or i32 %137, 3072
%139 = add i32 %48, %138
%140 = sext i32 %139 to i64
%141 = getelementptr inbounds float, float* %40, i64 %140
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8
%143 = fadd float %136, %142
%144 = shl i32 %.098108, 8
%145 = or i32 %144, 3328
%146 = add i32 %48, %145
%147 = sext i32 %146 to i64
%148 = getelementptr inbounds float, float* %40, i64 %147
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8
%150 = fadd float %143, %149
%151 = shl i32 %.098108, 8
%152 = or i32 %151, 3584
%153 = add i32 %48, %152
%154 = sext i32 %153 to i64
%155 = getelementptr inbounds float, float* %40, i64 %154
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8
%157 = fadd float %150, %156
%158 = shl i32 %.098108, 8
%159 = or i32 %158, 3840
%160 = add i32 %48, %159
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %40, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %157, %163
%165 = icmp slt i32 %50, 128
br i1 %165, label %49, label %.thread.preheader.loopexit
.preheader101: ; preds = %49
%.lcssa = phi i32 [ %51, %49 ]
%.098108.lcssa = phi i32 [ %.098108, %49 ]
%.095109.lcssa = phi float [ %.095109, %49 ]
%166 = add nsw i32 %.lcssa, %45
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %47
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %40, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = fadd float %.095109.lcssa, %172
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %45
%177 = icmp slt i32 %176, %3
br i1 %177, label %190, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %46, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %187, label %183
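; .thread is the warp-level tree reduction: inline-asm shfl.down.b32 with
; offsets 16, 8, 4, 2, 1 folds the 32 lane partials together. Block 178 then
; routes only lane 0 of warps holding a valid row (%brmerge false) to block
; 183, which issues the one atomicAdd per warp into the output row.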
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = fadd float %.8112, %179
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !73
; <label>:183: ; preds = %178
%184 = sext i32 %43 to i64
%185 = getelementptr inbounds float, float* %38, i64 %184
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8
br label %187
; <label>:187: ; preds = %178, %183
%188 = add nuw nsw i32 %.0114, 32
%189 = icmp slt i32 %188, %32
br i1 %189, label %41, label %._crit_edge.loopexit
; <label>:190: ; preds = %168
%191 = add nsw i32 %176, %47
%192 = sext i32 %191 to i64
%193 = getelementptr inbounds float, float* %40, i64 %192
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8
%195 = fadd float %173, %194
%196 = shl i32 %.098108.lcssa, 8
%197 = or i32 %196, 512
%198 = add nsw i32 %197, %45
%199 = icmp slt i32 %198, %3
br i1 %199, label %200, label %.thread.preheader
; <label>:200: ; preds = %190
%201 = add nsw i32 %198, %47
%202 = sext i32 %201 to i64
%203 = getelementptr inbounds float, float* %40, i64 %202
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8
%205 = fadd float %195, %204
%206 = shl i32 %.098108.lcssa, 8
%207 = or i32 %206, 768
%208 = add nsw i32 %207, %45
%209 = icmp slt i32 %208, %3
br i1 %209, label %210, label %.thread.preheader
; <label>:210: ; preds = %200
%211 = add nsw i32 %208, %47
%212 = sext i32 %211 to i64
%213 = getelementptr inbounds float, float* %40, i64 %212
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8
%215 = fadd float %205, %214
%216 = shl i32 %.098108.lcssa, 8
%217 = or i32 %216, 1024
%218 = add nsw i32 %217, %45
%219 = icmp slt i32 %218, %3
br i1 %219, label %220, label %.thread.preheader
; <label>:220: ; preds = %210
%221 = add nsw i32 %218, %47
%222 = sext i32 %221 to i64
%223 = getelementptr inbounds float, float* %40, i64 %222
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8
%225 = fadd float %215, %224
%226 = shl i32 %.098108.lcssa, 8
%227 = or i32 %226, 1280
%228 = add nsw i32 %227, %45
%229 = icmp slt i32 %228, %3
br i1 %229, label %230, label %.thread.preheader
; <label>:230: ; preds = %220
%231 = add nsw i32 %228, %47
%232 = sext i32 %231 to i64
%233 = getelementptr inbounds float, float* %40, i64 %232
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8
%235 = fadd float %225, %234
%236 = shl i32 %.098108.lcssa, 8
%237 = or i32 %236, 1536
%238 = add nsw i32 %237, %45
%239 = icmp slt i32 %238, %3
br i1 %239, label %240, label %.thread.preheader
; <label>:240: ; preds = %230
%241 = add nsw i32 %238, %47
%242 = sext i32 %241 to i64
%243 = getelementptr inbounds float, float* %40, i64 %242
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8
%245 = fadd float %235, %244
%246 = shl i32 %.098108.lcssa, 8
%247 = or i32 %246, 1792
%248 = add nsw i32 %247, %45
%249 = icmp slt i32 %248, %3
br i1 %249, label %250, label %.thread.preheader
; <label>:250: ; preds = %240
%251 = add nsw i32 %248, %47
%252 = sext i32 %251 to i64
%253 = getelementptr inbounds float, float* %40, i64 %252
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8
%255 = fadd float %245, %254
%256 = shl i32 %.098108.lcssa, 8
%257 = or i32 %256, 2048
%258 = add nsw i32 %257, %45
%259 = icmp slt i32 %258, %3
br i1 %259, label %260, label %.thread.preheader
; <label>:260: ; preds = %250
%261 = add nsw i32 %258, %47
%262 = sext i32 %261 to i64
%263 = getelementptr inbounds float, float* %40, i64 %262
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8
%265 = fadd float %255, %264
%266 = shl i32 %.098108.lcssa, 8
%267 = or i32 %266, 2304
%268 = add nsw i32 %267, %45
%269 = icmp slt i32 %268, %3
br i1 %269, label %270, label %.thread.preheader
; <label>:270: ; preds = %260
%271 = add nsw i32 %268, %47
%272 = sext i32 %271 to i64
%273 = getelementptr inbounds float, float* %40, i64 %272
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8
%275 = fadd float %265, %274
%276 = shl i32 %.098108.lcssa, 8
%277 = or i32 %276, 2560
%278 = add nsw i32 %277, %45
%279 = icmp slt i32 %278, %3
br i1 %279, label %280, label %.thread.preheader
; <label>:280: ; preds = %270
%281 = add nsw i32 %278, %47
%282 = sext i32 %281 to i64
%283 = getelementptr inbounds float, float* %40, i64 %282
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8
%285 = fadd float %275, %284
%286 = shl i32 %.098108.lcssa, 8
%287 = or i32 %286, 2816
%288 = add nsw i32 %287, %45
%289 = icmp slt i32 %288, %3
br i1 %289, label %290, label %.thread.preheader
; <label>:290: ; preds = %280
%291 = add nsw i32 %288, %47
%292 = sext i32 %291 to i64
%293 = getelementptr inbounds float, float* %40, i64 %292
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8
%295 = fadd float %285, %294
%296 = shl i32 %.098108.lcssa, 8
%297 = or i32 %296, 3072
%298 = add nsw i32 %297, %45
%299 = icmp slt i32 %298, %3
br i1 %299, label %300, label %.thread.preheader
; <label>:300: ; preds = %290
%301 = add nsw i32 %298, %47
%302 = sext i32 %301 to i64
%303 = getelementptr inbounds float, float* %40, i64 %302
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8
%305 = fadd float %295, %304
%306 = shl i32 %.098108.lcssa, 8
%307 = or i32 %306, 3328
%308 = add nsw i32 %307, %45
%309 = icmp slt i32 %308, %3
br i1 %309, label %310, label %.thread.preheader
; <label>:310: ; preds = %300
%311 = add nsw i32 %308, %47
%312 = sext i32 %311 to i64
%313 = getelementptr inbounds float, float* %40, i64 %312
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8
%315 = fadd float %305, %314
%316 = shl i32 %.098108.lcssa, 8
%317 = or i32 %316, 3584
%318 = add nsw i32 %317, %45
%319 = icmp slt i32 %318, %3
br i1 %319, label %320, label %.thread.preheader
; <label>:320: ; preds = %310
%321 = add nsw i32 %318, %47
%322 = sext i32 %321 to i64
%323 = getelementptr inbounds float, float* %40, i64 %322
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8
%325 = fadd float %315, %324
br label %.thread.preheader
}
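; RowReduceKernel<32, 256, 128> with CudaMaxReducer: the same row-chunking and
; warp-shuffle structure as the sum variant above, except the accumulator is
; seeded from the reducer's stored initial value (%.idx.val), elements are
; combined with llvm.nvvm.fmax.f, and the per-warp result is committed through
; the inlined CudaMaxReducer::atomic_reduce cmpxchg loop.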
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
br label %39
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ]
%40 = srem i32 %.0114, %31
%41 = sdiv i32 %.0114, %31
%42 = shl nsw i32 %40, 15
%43 = or i32 %42, %34
%.idx.val = load float, float* %.idx, align 4
%44 = icmp slt i32 %41, %2
br i1 %44, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %163, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %39
%45 = mul nsw i32 %41, %3
%46 = add i32 %45, %43
%47 = load float*, float** %38, align 8
br label %48
; <label>:48: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ]
%49 = add nuw nsw i32 %.098108, 16
%50 = shl i32 %.098108, 8
%51 = or i32 %50, 3840
%52 = add nsw i32 %51, %43
%53 = icmp slt i32 %52, %3
br i1 %53, label %.preheader.preheader, label %.preheader101
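; Fast path, max variant: as in the sum kernel above, a fully in-range
; 4096-element sub-chunk is handled by 16 unrolled ldg + fmax steps.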
.preheader.preheader: ; preds = %48
%54 = add i32 %46, %50
%55 = sext i32 %54 to i64
%56 = getelementptr inbounds float, float* %47, i64 %55
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8
%59 = shl i32 %.098108, 8
%60 = or i32 %59, 256
%61 = add i32 %46, %60
%62 = sext i32 %61 to i64
%63 = getelementptr inbounds float, float* %47, i64 %62
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8
%66 = shl i32 %.098108, 8
%67 = or i32 %66, 512
%68 = add i32 %46, %67
%69 = sext i32 %68 to i64
%70 = getelementptr inbounds float, float* %47, i64 %69
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8
%73 = shl i32 %.098108, 8
%74 = or i32 %73, 768
%75 = add i32 %46, %74
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %47, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8
%80 = shl i32 %.098108, 8
%81 = or i32 %80, 1024
%82 = add i32 %46, %81
%83 = sext i32 %82 to i64
%84 = getelementptr inbounds float, float* %47, i64 %83
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8
%87 = shl i32 %.098108, 8
%88 = or i32 %87, 1280
%89 = add i32 %46, %88
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds float, float* %47, i64 %90
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8
%94 = shl i32 %.098108, 8
%95 = or i32 %94, 1536
%96 = add i32 %46, %95
%97 = sext i32 %96 to i64
%98 = getelementptr inbounds float, float* %47, i64 %97
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8
%101 = shl i32 %.098108, 8
%102 = or i32 %101, 1792
%103 = add i32 %46, %102
%104 = sext i32 %103 to i64
%105 = getelementptr inbounds float, float* %47, i64 %104
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8
%108 = shl i32 %.098108, 8
%109 = or i32 %108, 2048
%110 = add i32 %46, %109
%111 = sext i32 %110 to i64
%112 = getelementptr inbounds float, float* %47, i64 %111
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8
%115 = shl i32 %.098108, 8
%116 = or i32 %115, 2304
%117 = add i32 %46, %116
%118 = sext i32 %117 to i64
%119 = getelementptr inbounds float, float* %47, i64 %118
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8
%122 = shl i32 %.098108, 8
%123 = or i32 %122, 2560
%124 = add i32 %46, %123
%125 = sext i32 %124 to i64
%126 = getelementptr inbounds float, float* %47, i64 %125
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8
%129 = shl i32 %.098108, 8
%130 = or i32 %129, 2816
%131 = add i32 %46, %130
%132 = sext i32 %131 to i64
%133 = getelementptr inbounds float, float* %47, i64 %132
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8
%136 = shl i32 %.098108, 8
%137 = or i32 %136, 3072
%138 = add i32 %46, %137
%139 = sext i32 %138 to i64
%140 = getelementptr inbounds float, float* %47, i64 %139
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8
%143 = shl i32 %.098108, 8
%144 = or i32 %143, 3328
%145 = add i32 %46, %144
%146 = sext i32 %145 to i64
%147 = getelementptr inbounds float, float* %47, i64 %146
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8
%150 = shl i32 %.098108, 8
%151 = or i32 %150, 3584
%152 = add i32 %46, %151
%153 = sext i32 %152 to i64
%154 = getelementptr inbounds float, float* %47, i64 %153
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8
%157 = shl i32 %.098108, 8
%158 = or i32 %157, 3840
%159 = add i32 %46, %158
%160 = sext i32 %159 to i64
%161 = getelementptr inbounds float, float* %47, i64 %160
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8
%164 = icmp slt i32 %49, 128
br i1 %164, label %48, label %.thread.preheader.loopexit
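; Remainder path: when the 16-wide guard fails, fall through here and redo
; the same 16 strided loads one at a time (blocks %168, %198, %208, ...),
; each with its own bounds check against %3, branching to %.thread.preheader
; as soon as a load would go out of range.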
.preheader101: ; preds = %48
%.lcssa = phi i32 [ %50, %48 ]
%.098108.lcssa = phi i32 [ %.098108, %48 ]
%.095109.lcssa = phi float [ %.095109, %48 ]
%165 = load float*, float** %38, align 8
%166 = add nsw i32 %.lcssa, %43
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %45
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %165, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %43
%177 = icmp slt i32 %176, %3
br i1 %177, label %198, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %44, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183
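; Warp-level tree reduction: combine the 32 per-lane partial maxima with
; shfl.down at offsets 16, 8, 4, 2, 1. A hedged CUDA sketch (pre-CUDA-9
; __shfl_down, matching this 2016-era module; names illustrative):
;   for (int offset = 16; offset > 0; offset >>= 1)
;     val = fmaxf(val, __shfl_down(val, offset, 32));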
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !74
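; Publish via compare-and-swap: a float atomic-max built on i32 cmpxchg,
; since sm_35 has no native float atomic max. A hedged CUDA equivalent of
; blocks %183/%190/%193 (illustrative names, not Eigen's exact source):
;   __device__ void atomic_max(float* out, float v) {
;     unsigned* addr = reinterpret_cast<unsigned*>(out);
;     unsigned old = *addr;
;     while (__uint_as_float(old) < v) {
;       unsigned assumed = old;
;       old = atomicCAS(addr, assumed, __float_as_uint(v));
;       if (old == assumed) break;   // CAS succeeded, value published
;     }
;   }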
; <label>:183: ; preds = %178
%184 = sext i32 %41 to i64
%185 = load float*, float** %37, align 8
%186 = getelementptr inbounds float, float* %185, i64 %184
%187 = bitcast float %.lcssa138 to i32
%188 = bitcast float* %186 to i32*
%189 = load i32, i32* %188, align 4
br label %190
; <label>:190: ; preds = %193, %183
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ]
%191 = bitcast i32 %.011.i to float
%192 = fcmp olt float %191, %.lcssa138
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit
; <label>:193: ; preds = %190
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst
%195 = extractvalue { i32, i1 } %194, 0
%not..i = icmp eq i32 %.011.i, %195
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178
%196 = add nuw nsw i32 %.0114, 32
%197 = icmp slt i32 %196, %32
br i1 %197, label %39, label %._crit_edge.loopexit
; <label>:198: ; preds = %168
%199 = add nsw i32 %176, %45
%200 = sext i32 %199 to i64
%201 = getelementptr inbounds float, float* %165, i64 %200
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8
%204 = shl i32 %.098108.lcssa, 8
%205 = or i32 %204, 512
%206 = add nsw i32 %205, %43
%207 = icmp slt i32 %206, %3
br i1 %207, label %208, label %.thread.preheader
; <label>:208: ; preds = %198
%209 = add nsw i32 %206, %45
%210 = sext i32 %209 to i64
%211 = getelementptr inbounds float, float* %165, i64 %210
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8
%214 = shl i32 %.098108.lcssa, 8
%215 = or i32 %214, 768
%216 = add nsw i32 %215, %43
%217 = icmp slt i32 %216, %3
br i1 %217, label %218, label %.thread.preheader
; <label>:218: ; preds = %208
%219 = add nsw i32 %216, %45
%220 = sext i32 %219 to i64
%221 = getelementptr inbounds float, float* %165, i64 %220
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8
%224 = shl i32 %.098108.lcssa, 8
%225 = or i32 %224, 1024
%226 = add nsw i32 %225, %43
%227 = icmp slt i32 %226, %3
br i1 %227, label %228, label %.thread.preheader
; <label>:228: ; preds = %218
%229 = add nsw i32 %226, %45
%230 = sext i32 %229 to i64
%231 = getelementptr inbounds float, float* %165, i64 %230
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8
%234 = shl i32 %.098108.lcssa, 8
%235 = or i32 %234, 1280
%236 = add nsw i32 %235, %43
%237 = icmp slt i32 %236, %3
br i1 %237, label %238, label %.thread.preheader
; <label>:238: ; preds = %228
%239 = add nsw i32 %236, %45
%240 = sext i32 %239 to i64
%241 = getelementptr inbounds float, float* %165, i64 %240
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8
%244 = shl i32 %.098108.lcssa, 8
%245 = or i32 %244, 1536
%246 = add nsw i32 %245, %43
%247 = icmp slt i32 %246, %3
br i1 %247, label %248, label %.thread.preheader
; <label>:248: ; preds = %238
%249 = add nsw i32 %246, %45
%250 = sext i32 %249 to i64
%251 = getelementptr inbounds float, float* %165, i64 %250
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8
%254 = shl i32 %.098108.lcssa, 8
%255 = or i32 %254, 1792
%256 = add nsw i32 %255, %43
%257 = icmp slt i32 %256, %3
br i1 %257, label %258, label %.thread.preheader
; <label>:258: ; preds = %248
%259 = add nsw i32 %256, %45
%260 = sext i32 %259 to i64
%261 = getelementptr inbounds float, float* %165, i64 %260
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8
%264 = shl i32 %.098108.lcssa, 8
%265 = or i32 %264, 2048
%266 = add nsw i32 %265, %43
%267 = icmp slt i32 %266, %3
br i1 %267, label %268, label %.thread.preheader
; <label>:268: ; preds = %258
%269 = add nsw i32 %266, %45
%270 = sext i32 %269 to i64
%271 = getelementptr inbounds float, float* %165, i64 %270
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8
%274 = shl i32 %.098108.lcssa, 8
%275 = or i32 %274, 2304
%276 = add nsw i32 %275, %43
%277 = icmp slt i32 %276, %3
br i1 %277, label %278, label %.thread.preheader
; <label>:278: ; preds = %268
%279 = add nsw i32 %276, %45
%280 = sext i32 %279 to i64
%281 = getelementptr inbounds float, float* %165, i64 %280
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8
%284 = shl i32 %.098108.lcssa, 8
%285 = or i32 %284, 2560
%286 = add nsw i32 %285, %43
%287 = icmp slt i32 %286, %3
br i1 %287, label %288, label %.thread.preheader
; <label>:288: ; preds = %278
%289 = add nsw i32 %286, %45
%290 = sext i32 %289 to i64
%291 = getelementptr inbounds float, float* %165, i64 %290
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8
%294 = shl i32 %.098108.lcssa, 8
%295 = or i32 %294, 2816
%296 = add nsw i32 %295, %43
%297 = icmp slt i32 %296, %3
br i1 %297, label %298, label %.thread.preheader
; <label>:298: ; preds = %288
%299 = add nsw i32 %296, %45
%300 = sext i32 %299 to i64
%301 = getelementptr inbounds float, float* %165, i64 %300
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8
%304 = shl i32 %.098108.lcssa, 8
%305 = or i32 %304, 3072
%306 = add nsw i32 %305, %43
%307 = icmp slt i32 %306, %3
br i1 %307, label %308, label %.thread.preheader
; <label>:308: ; preds = %298
%309 = add nsw i32 %306, %45
%310 = sext i32 %309 to i64
%311 = getelementptr inbounds float, float* %165, i64 %310
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8
%314 = shl i32 %.098108.lcssa, 8
%315 = or i32 %314, 3328
%316 = add nsw i32 %315, %43
%317 = icmp slt i32 %316, %3
br i1 %317, label %318, label %.thread.preheader
; <label>:318: ; preds = %308
%319 = add nsw i32 %316, %45
%320 = sext i32 %319 to i64
%321 = getelementptr inbounds float, float* %165, i64 %320
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8
%324 = shl i32 %.098108.lcssa, 8
%325 = or i32 %324, 3584
%326 = add nsw i32 %325, %43
%327 = icmp slt i32 %326, %3
br i1 %327, label %328, label %.thread.preheader
; <label>:328: ; preds = %318
%329 = add nsw i32 %326, %45
%330 = sext i32 %329 to i64
%331 = getelementptr inbounds float, float* %165, i64 %330
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8
br label %.thread.preheader
}
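; ColumnReduceKernel<128 blocks, 256 threads, 16 rows per thread> with
; CudaSumReducer: after asserting the expected launch configuration, each
; work item owns one (column, 16-row block) pair, accumulates a partial sum
; with __ldg, and folds it into its output column with a float atomicAdd
; (llvm.nvvm.atomic.load.add.f32). Hedged CUDA sketch of the main path
; (identifiers illustrative, not Eigen's exact source):
;   int blocks = (num_rows + 15) / 16;
;   for (int idx = blockIdx.x * 256 + threadIdx.x;
;        idx < blocks * num_cols; idx += 128 * 256) {
;     int col = idx % num_cols;
;     int rb  = (idx / num_cols) % blocks;
;     float sum = 0.0f;
;     for (int k = 0; k < 16; ++k) {        // unrolled by 2 in the IR below
;       int row = rb * 16 + k;
;       if (row < num_rows) sum += __ldg(in + row * num_cols + col);
;     }
;     atomicAdd(out + col, sum);
;   }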
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
%41 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0
%42 = load float*, float** %41, align 8
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
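; %38 unswitches on the sign of the column count %3: the .us blocks are the
; real path (%3 >= 0); the .split blocks are the clone for a negative count,
; in which the optimizer folded every partial sum to 0.0 so that only
; atomicAdd(out + idx % num_cols, 0.0f) survives per iteration.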
.lr.ph.split.preheader: ; preds = %.lr.ph
%43 = add i32 %32, -1
%44 = sub i32 %43, %34
%45 = sub i32 %44, %35
%46 = lshr i32 %45, 15
%47 = add nuw nsw i32 %46, 1
%xtraiter = and i32 %47, 3
%48 = icmp ult i32 %45, 98304
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader
%unroll_iter = sub nsw i32 %47, %xtraiter
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ]
%49 = srem i32 %.047.us, %3
%50 = sdiv i32 %.047.us, %3
%51 = srem i32 %50, %31
%52 = shl nsw i32 %51, 4
br label %53
; <label>:53: ; preds = %104, %.lr.ph.split.us
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ]
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ]
%54 = add nuw nsw i32 %.04346.us.us, %52
%55 = icmp slt i32 %54, %2
br i1 %55, label %56, label %62
; <label>:56: ; preds = %53
%57 = mul nsw i32 %54, %3
%58 = add nsw i32 %57, %49
%59 = sext i32 %58 to i64
%60 = getelementptr inbounds float, float* %40, i64 %59
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8
br label %62
; <label>:62: ; preds = %56, %53
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ]
%64 = fadd float %.04445.us.us, %63
%65 = or i32 %.04346.us.us, 1
%66 = add nuw nsw i32 %65, %52
%67 = icmp slt i32 %66, %2
br i1 %67, label %98, label %104
.us-lcssa.us.us: ; preds = %104
%.lcssa = phi float [ %106, %104 ]
%68 = sext i32 %49 to i64
%69 = getelementptr inbounds float, float* %42, i64 %68
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8
%71 = add nuw nsw i32 %.047.us, 32768
%72 = icmp slt i32 %71, %32
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us
br label %._crit_edge
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ]
br label %._crit_edge.loopexit59.unr-lcssa
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa
br label %.lr.ph.split.epil
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ]
%73 = srem i32 %.047.epil, %3
%74 = sext i32 %73 to i64
%75 = getelementptr inbounds float, float* %42, i64 %74
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8
%77 = add nuw nsw i32 %.047.epil, 32768
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !75
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil
br label %._crit_edge.loopexit59
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29
ret void
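; Degenerate .split loop, unrolled 4x: each step is an
; atomicAdd(out + idx % num_cols, 0.0f) at idx strides of 32768;
; %xtraiter/%unroll_iter and the .epil blocks above handle the up-to-3
; leftover iterations.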
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ]
%78 = srem i32 %.047, %3
%79 = sext i32 %78 to i64
%80 = getelementptr inbounds float, float* %42, i64 %79
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8
%82 = add nuw nsw i32 %.047, 32768
%83 = srem i32 %82, %3
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %42, i64 %84
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8
%87 = add nsw i32 %.047, 65536
%88 = srem i32 %87, %3
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %42, i64 %89
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8
%92 = add nsw i32 %.047, 98304
%93 = srem i32 %92, %3
%94 = sext i32 %93 to i64
%95 = getelementptr inbounds float, float* %42, i64 %94
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8
%97 = add nsw i32 %.047, 131072
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split
; <label>:98: ; preds = %62
%99 = mul nsw i32 %66, %3
%100 = add nsw i32 %99, %49
%101 = sext i32 %100 to i64
%102 = getelementptr inbounds float, float* %40, i64 %101
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8
br label %104
; <label>:104: ; preds = %98, %62
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ]
%106 = fadd float %64, %105
%107 = add nsw i32 %.04346.us.us, 2
%exitcond.1 = icmp eq i32 %107, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53
}
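; Same ColumnReduceKernel shape, now with CudaMaxReducer: partial results
; are seeded from the reducer's stored initial value (%.idx45), combined
; with fmax instead of fadd, and published with the cmpxchg-based atomic max
; shown earlier rather than an atomicAdd.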
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ]
%41 = srem i32 %.048.us, %3
%42 = sdiv i32 %.048.us, %3
%43 = srem i32 %42, %31
%44 = shl nsw i32 %43, 4
%.idx45.val.us = load float, float* %.idx45, align 4
%45 = load float*, float** %39, align 8
br label %54
; <label>:46: ; preds = %49, %.us-lcssa.us.us
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ]
%47 = bitcast i32 %.011.i.us to float
%48 = fcmp olt float %47, %.lcssa
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
; <label>:49: ; preds = %46
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst
%51 = extractvalue { i32, i1 } %50, 0
%not..i.us = icmp eq i32 %.011.i.us, %51
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46
%52 = add nuw nsw i32 %.048.us, 32768
%53 = icmp slt i32 %52, %32
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit
; <label>:54: ; preds = %112, %.lr.ph.split.us
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ]
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ]
%55 = add nuw nsw i32 %.04347.us.us, %44
%56 = icmp slt i32 %55, %2
br i1 %56, label %57, label %63
; <label>:57: ; preds = %54
%58 = mul nsw i32 %55, %3
%59 = add nsw i32 %58, %41
%60 = sext i32 %59 to i64
%61 = getelementptr inbounds float, float* %45, i64 %60
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8
br label %63
; <label>:63: ; preds = %54, %57
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ]
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8
%66 = or i32 %.04347.us.us, 1
%67 = add nuw nsw i32 %66, %44
%68 = icmp slt i32 %67, %2
br i1 %68, label %106, label %112
.us-lcssa.us.us: ; preds = %112
%.lcssa = phi float [ %114, %112 ]
%69 = sext i32 %41 to i64
%70 = load float*, float** %40, align 8
%71 = getelementptr inbounds float, float* %70, i64 %69
%72 = bitcast float %.lcssa to i32
%73 = bitcast float* %71 to i32*
%74 = load i32, i32* %73, align 4
br label %46
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
br label %._crit_edge
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29
ret void
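; Degenerate .split path of the max kernel: the optimizer replaced every
; load with the reducer's initial value, so the body collapses to 16 chained
; fmax calls of that value with itself, and the CAS loop below then tries to
; publish it.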
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ]
%.idx45.val = load float, float* %.idx45, align 4
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8
%91 = srem i32 %.048, %3
%92 = sext i32 %91 to i64
%93 = load float*, float** %40, align 8
%94 = getelementptr inbounds float, float* %93, i64 %92
%95 = bitcast float %90 to i32
%96 = bitcast float* %94 to i32*
%97 = load i32, i32* %96, align 4
br label %98
; <label>:98: ; preds = %101, %.lr.ph.split
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ]
%99 = bitcast i32 %.011.i to float
%100 = fcmp olt float %99, %90
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
; <label>:101: ; preds = %98
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst
%103 = extractvalue { i32, i1 } %102, 0
%not..i = icmp eq i32 %.011.i, %103
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101
%104 = add nuw nsw i32 %.048, 32768
%105 = icmp slt i32 %104, %32
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60
; <label>:106: ; preds = %63
%107 = mul nsw i32 %67, %3
%108 = add nsw i32 %107, %41
%109 = sext i32 %108 to i64
%110 = getelementptr inbounds float, float* %45, i64 %109
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8
br label %112
; <label>:112: ; preds = %106, %63
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ]
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8
%115 = add nsw i32 %.04347.us.us, 2
%exitcond.1 = icmp eq i32 %115, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54
}
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.fmax.f(float, float) #1
attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { convergent nounwind }
attributes #4 = { argmemonly nounwind readonly }
attributes #5 = { argmemonly nounwind }
attributes #6 = { convergent inlinehint noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { convergent noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { nounwind }
attributes #9 = { convergent }
attributes #10 = { convergent noreturn nounwind }
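; NVVM module metadata: each !{..., !"kernel", i32 1} entry below marks a
; __global__ entry point, and the !"maxntidx" entries record a
; __launch_bounds__-style cap of 1024 threads per block.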
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !38, !40, !40, !40, !40, !41, !41, !40}
!llvm.module.flags = !{!42, !43}
!llvm.ident = !{!44}
!nvvm.internalize.after.link = !{}
!nvvmir.version = !{!45}
!0 = !{void (float, i32, float*)* @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_, !"kernel", i32 1}
!1 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!2 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!3 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!4 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!5 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!6 = !{void (float, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_, !"kernel", i32 1}
!7 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!8 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!9 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!10 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!11 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!12 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!13 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!14 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!15 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1}
!16 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!17 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!18 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!19 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!20 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!21 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!22 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!23 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!24 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!25 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!26 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!27 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!28 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!29 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!30 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!31 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!32 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!33 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1}
!34 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!35 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!36 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!37 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!38 = !{null, !"align", i32 8}
!39 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!40 = !{null, !"align", i32 16}
!41 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!42 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!43 = !{i32 1, !"PIC Level", i32 2}
!44 = !{!"clang version google3-trunk (trunk r271374)"}
!45 = !{i32 1, i32 2}
!46 = !{i32 0, i32 65535}
!47 = !{i32 1, i32 1025}
!48 = !{i32 0, i32 1024}
!49 = !{i32 1, i32 65536}
!50 = distinct !{!50, !51}
!51 = !{!"llvm.loop.unroll.disable"}
!52 = distinct !{!52, !51}
!53 = !{i32 457534}
!54 = distinct !{!54, !55}
!55 = !{!"llvm.loop.unroll.enable"}
!56 = distinct !{!56, !51}
!57 = !{i32 1, i32 65}
!58 = distinct !{!58, !55}
!59 = distinct !{!59, !55}
!60 = distinct !{!60, !51}
!61 = distinct !{!61, !51}
!62 = distinct !{!62, !55}
!63 = distinct !{!63, !55}
!64 = distinct !{!64, !51}
!65 = distinct !{!65, !51}
!66 = distinct !{!66, !51}
!67 = distinct !{!67, !55}
!68 = distinct !{!68, !51}
!69 = distinct !{!69, !55}
!70 = distinct !{!70, !55}
!71 = distinct !{!71, !51}
!72 = distinct !{!72, !51}
!73 = distinct !{!73, !55}
!74 = distinct !{!74, !55}
!75 = distinct !{!75, !51}