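; LLVM IR dump from gist anonymous/d8fa9ec0295e4ae808a8150e776b6871 (created
; June 1, 2016), apparently from Eigen's cxx11_tensor_reduction_cuda test compiled
; for sm_35 (target triple nvptx64-nvidia-cuda). The module instantiates Eigen's
; full/inner/outer reduction kernels and the anonymous-namespace row/column reduce
; kernels for 2-D float tensors in both row-major (Options = 1) and column-major
; (Options = 0) layouts.
;
; For orientation, host code of roughly this shape instantiates kernels like these
; (a hedged sketch against the public Eigen Tensor API; the device pointers and
; variable names are illustrative, not taken from the original test):
;
;   Eigen::CudaStreamDevice stream;
;   Eigen::GpuDevice gpu_device(&stream);
;   // Map pre-allocated device buffers as a 2-D input and a 1-D output tensor.
;   Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int>> in(d_in, rows, cols);
;   Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor, int>> out(d_out, cols);
;   Eigen::array<int, 1> red_dim{{0}};          // reduce over the first dimension
;   out.device(gpu_device) = in.sum(red_dim);   // dispatches one of the kernels below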
; ModuleID = '<stdin>'
source_filename = "cxx11_tensor_reduction_cuda-sm_35.cui"
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
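; Struct layouts for the Eigen evaluator and reducer objects that are passed
; byval to the kernels below; the numeric suffixes distinguish separate template
; instantiations (several of which have identical field layouts).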
%"struct.Eigen::internal::SumReducer" = type { i8 }
%"struct.Eigen::TensorEvaluator" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* }
%"class.Eigen::array" = type { [2 x i8] }
%"struct.Eigen::DSizes" = type { %"class.Eigen::array.0" }
%"class.Eigen::array.1" = type { [2 x i32] }
%"class.Eigen::array.2" = type { [1 x %"struct.Eigen::internal::TensorIntDivisor"] }
%"struct.Eigen::internal::TensorIntDivisor" = type { i32, i32, i32 }
%"class.Eigen::array.0" = type { [1 x i32] }
%"struct.Eigen::TensorEvaluator.3" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::DSizes.4" = type { %"class.Eigen::array.1" }
%"struct.Eigen::GpuDevice" = type { %"class.Eigen::StreamInterface"* }
%"class.Eigen::StreamInterface" = type { i32 (...)** }
%"struct.Eigen::TensorEvaluator.5" = type { %"struct.Eigen::TensorEvaluator", %"struct.Eigen::GpuDevice"*, float* }
%"struct.Eigen::internal::PtrWrapper" = type { float* }
%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" }
%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::Identity" }
%"struct.Eigen::internal::(anonymous namespace)::Identity" = type { i8 }
%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer" = type { float }
%"struct.Eigen::TensorEvaluator.6" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.8" }
%"struct.Eigen::TensorEvaluator.7" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.8" = type { %"struct.Eigen::TensorEvaluator", %"class.Eigen::TensorReductionOp", %"struct.Eigen::GpuDevice"*, float* }
%"class.Eigen::TensorReductionOp" = type <{ %"class.Eigen::TensorMap"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }>
%"class.Eigen::TensorMap" = type { float*, %"struct.Eigen::DSizes.4" }
%"struct.Eigen::TensorEvaluator.11" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator" }
%"struct.Eigen::TensorEvaluator.12" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.13" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.14" = type { %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::GpuDevice"*, float* }
%"struct.Eigen::TensorEvaluator.15" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.17" }
%"struct.Eigen::TensorEvaluator.16" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* }
%"struct.Eigen::TensorEvaluator.17" = type { %"struct.Eigen::TensorEvaluator.12", %"class.Eigen::TensorReductionOp.18", %"struct.Eigen::GpuDevice"*, float* }
%"class.Eigen::TensorReductionOp.18" = type <{ %"class.Eigen::TensorMap.20"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }>
%"class.Eigen::TensorMap.20" = type { float*, %"struct.Eigen::DSizes.4" }
%"struct.Eigen::TensorEvaluator.24" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.12" }
$_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_ = comdat any
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any
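; Assertion-failure strings: the anonymous-namespace Row/ColumnReduceKernels check
; that the actual launch configuration matches their GRID_DIM/BLOCK_DIM template
; parameters (see the asserts at the top of each such kernel).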
@.str = private unnamed_addr constant [24 x i8] c"blockDim.x == BLOCK_DIM\00", align 1
@.str.1 = private unnamed_addr constant [76 x i8] c"third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@.str.2 = private unnamed_addr constant [16 x i8] c"blockDim.y == 1\00", align 1
@.str.3 = private unnamed_addr constant [16 x i8] c"blockDim.z == 1\00", align 1
@.str.4 = private unnamed_addr constant [22 x i8] c"gridDim.x == GRID_DIM\00", align 1
@.str.5 = private unnamed_addr constant [15 x i8] c"gridDim.y == 1\00", align 1
@.str.6 = private unnamed_addr constant [15 x i8] c"gridDim.z == 1\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1
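; ReductionInitKernel<float, int>: grid-stride loop that stores the reducer's
; initial value (%0) into the first %1 elements of the output buffer %2.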
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_(float, i32, float*) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = add nuw nsw i32 %6, %7
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%10 = mul nuw nsw i32 %9, %5
%11 = icmp slt i32 %8, %1
br i1 %11, label %.lr.ph.preheader, label %._crit_edge
.lr.ph.preheader: ; preds = %3
br label %.lr.ph
._crit_edge.loopexit: ; preds = %.lr.ph
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %3
ret void
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.012 = phi i32 [ %14, %.lr.ph ], [ %8, %.lr.ph.preheader ]
%12 = sext i32 %.012 to i64
%13 = getelementptr inbounds float, float* %2, i64 %12
store float %0, float* %13, align 4
%14 = add nsw i32 %.012, %10
%15 = icmp slt i32 %14, %1
br i1 %15, label %.lr.ph, label %._crit_edge.loopexit
}
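; PTX special-register reads: ctaid.x = blockIdx.x, ntid.x = blockDim.x,
; tid.x = threadIdx.x, nctaid.x = gridDim.x.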
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ctaid.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.tid.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.x() #1
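; FullReductionKernel<BlockSize = 256, NumPerThread = 128>: when gridDim.x == 1,
; thread 0 zero-initializes the scalar output before a __syncthreads(). Each
; thread then sums up to 32768 (= 256 * 128) coefficients per block through
; __ldg loads (unrolled x8 plus a remainder loop), warp-reduces with
; shfl.down.b32, and lane 0 of every warp atomically adds its partial sum into
; the output.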
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, float*) #2 comdat {
%5 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%6 = shl nuw nsw i32 %5, 15
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = or i32 %6, %7
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%10 = icmp eq i32 %9, 1
br i1 %10, label %11, label %15
; <label>:11: ; preds = %4
%12 = icmp eq i32 %8, 0
br i1 %12, label %13, label %14
; <label>:13: ; preds = %11
store float 0.000000e+00, float* %3, align 4
br label %14
; <label>:14: ; preds = %13, %11
tail call void @llvm.cuda.syncthreads()
br label %15
; <label>:15: ; preds = %14, %4
%16 = sub nsw i32 %2, %8
%17 = icmp sgt i32 %16, 32768
%..i = select i1 %17, i32 32768, i32 %16
%18 = icmp sgt i32 %16, 0
br i1 %18, label %.lr.ph, label %.preheader.preheader
.preheader.preheader.loopexit: ; preds = %.epil.preheader
%.lcssa47 = phi float [ %23, %.epil.preheader ]
br label %.preheader.preheader
.preheader.preheader: ; preds = %.preheader.preheader.loopexit, %.preheader.preheader.loopexit.unr-lcssa, %15
%.132.ph = phi float [ 0.000000e+00, %15 ], [ %.lcssa36.ph, %.preheader.preheader.loopexit.unr-lcssa ], [ %.lcssa47, %.preheader.preheader.loopexit ]
br label %.preheader
.preheader.preheader.loopexit.unr-lcssa.loopexit: ; preds = %32
%.lcssa49 = phi i32 [ %80, %32 ]
%.lcssa48 = phi float [ %79, %32 ]
br label %.preheader.preheader.loopexit.unr-lcssa
.preheader.preheader.loopexit.unr-lcssa: ; preds = %.preheader.preheader.loopexit.unr-lcssa.loopexit, %.lr.ph
%.lcssa36.ph = phi float [ undef, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%.02535.unr = phi i32 [ 0, %.lr.ph ], [ %.lcssa49, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%.03134.unr = phi float [ 0.000000e+00, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %.preheader.preheader, label %.epil.preheader.preheader
.epil.preheader.preheader: ; preds = %.preheader.preheader.loopexit.unr-lcssa
br label %.epil.preheader
.epil.preheader: ; preds = %.epil.preheader.preheader, %.epil.preheader
%.02535.epil = phi i32 [ %24, %.epil.preheader ], [ %.02535.unr, %.epil.preheader.preheader ]
%.03134.epil = phi float [ %23, %.epil.preheader ], [ %.03134.unr, %.epil.preheader.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.epil.preheader ], [ %xtraiter, %.epil.preheader.preheader ]
%19 = add nuw nsw i32 %.02535.epil, %8
%20 = sext i32 %19 to i64
%21 = getelementptr inbounds float, float* %26, i64 %20
%22 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %21, i32 4) #8
%23 = fadd float %.03134.epil, %22
%24 = add nuw nsw i32 %.02535.epil, 256
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %.preheader.preheader.loopexit, label %.epil.preheader, !llvm.loop !50
.lr.ph: ; preds = %15
%25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0
%26 = load float*, float** %25, align 8
%27 = icmp sgt i32 %..i, 256
%smax = select i1 %27, i32 %..i, i32 256
%28 = add i32 %smax, -1
%29 = lshr i32 %28, 8
%30 = add nuw nsw i32 %29, 1
%xtraiter = and i32 %30, 7
%31 = icmp ult i32 %28, 1792
br i1 %31, label %.preheader.preheader.loopexit.unr-lcssa, label %.lr.ph.new
.lr.ph.new: ; preds = %.lr.ph
%unroll_iter = sub nsw i32 %30, %xtraiter
br label %32
; <label>:32: ; preds = %32, %.lr.ph.new
%.02535 = phi i32 [ 0, %.lr.ph.new ], [ %80, %32 ]
%.03134 = phi float [ 0.000000e+00, %.lr.ph.new ], [ %79, %32 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.new ], [ %niter.nsub.7, %32 ]
%33 = add nuw nsw i32 %.02535, %8
%34 = sext i32 %33 to i64
%35 = getelementptr inbounds float, float* %26, i64 %34
%36 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %35, i32 4) #8
%37 = fadd float %.03134, %36
%38 = or i32 %.02535, 256
%39 = add nuw nsw i32 %38, %8
%40 = sext i32 %39 to i64
%41 = getelementptr inbounds float, float* %26, i64 %40
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
%43 = fadd float %37, %42
%44 = or i32 %.02535, 512
%45 = add nuw nsw i32 %44, %8
%46 = sext i32 %45 to i64
%47 = getelementptr inbounds float, float* %26, i64 %46
%48 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %47, i32 4) #8
%49 = fadd float %43, %48
%50 = or i32 %.02535, 768
%51 = add nuw nsw i32 %50, %8
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %26, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = or i32 %.02535, 1024
%57 = add nuw nsw i32 %56, %8
%58 = sext i32 %57 to i64
%59 = getelementptr inbounds float, float* %26, i64 %58
%60 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %59, i32 4) #8
%61 = fadd float %55, %60
%62 = or i32 %.02535, 1280
%63 = add nuw nsw i32 %62, %8
%64 = sext i32 %63 to i64
%65 = getelementptr inbounds float, float* %26, i64 %64
%66 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %65, i32 4) #8
%67 = fadd float %61, %66
%68 = or i32 %.02535, 1536
%69 = add nuw nsw i32 %68, %8
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %26, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %67, %72
%74 = or i32 %.02535, 1792
%75 = add nuw nsw i32 %74, %8
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %26, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = fadd float %73, %78
%80 = add nsw i32 %.02535, 2048
%niter.nsub.7 = add i32 %niter, -8
%niter.ncmp.7 = icmp eq i32 %niter.nsub.7, 0
br i1 %niter.ncmp.7, label %.preheader.preheader.loopexit.unr-lcssa.loopexit, label %32, !llvm.loop !52
; <label>:81: ; preds = %.preheader
%.lcssa = phi float [ %85, %.preheader ]
%82 = and i32 %7, 31
%83 = icmp eq i32 %82, 0
br i1 %83, label %88, label %90
.preheader: ; preds = %.preheader.preheader, %.preheader
%.033 = phi i32 [ %86, %.preheader ], [ 16, %.preheader.preheader ]
%.132 = phi float [ %85, %.preheader ], [ %.132.ph, %.preheader.preheader ]
%84 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.132, i32 %.033, i32 31) #3, !srcloc !53
%85 = fadd float %.132, %84
%86 = lshr i32 %.033, 1
%87 = icmp eq i32 %86, 0
br i1 %87, label %81, label %.preheader, !llvm.loop !54
; <label>:88: ; preds = %81
%89 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %3, float %.lcssa) #8
br label %90
; <label>:90: ; preds = %88, %81
ret void
}
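; CUDA builtins used above: __syncthreads(), __ldg() read-only cached loads, and
; atomicAdd on float.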
; Function Attrs: convergent nounwind
declare void @llvm.cuda.syncthreads() #3
; Function Attrs: argmemonly nounwind readonly
declare float @llvm.nvvm.ldg.global.f.f32.p0f32(float* nocapture, i32) #4
; Function Attrs: argmemonly nounwind
declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* nocapture, float) #5
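; InnerReductionKernel<NumPerThread = 128>: reduces along the inner (contiguous)
; dimension. When gridDim.x == 1 it zero-fills the %3 output slots in-kernel
; (otherwise the output is presumably pre-initialized, e.g. by
; ReductionInitKernel). Each thread then accumulates up to 128 coefficients of
; its assigned row in 16-way unrolled strides of blockDim.x, warp-reduces via
; shfl.down.b32, and lane 0 atomicAdds the partial sum into the corresponding
; output element.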
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = shl nuw nsw i32 %6, 7
%8 = add i32 %2, -1
%9 = add i32 %8, %7
%10 = udiv i32 %9, %7
%11 = mul nsw i32 %10, %3
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %6
%14 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%15 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%16 = icmp eq i32 %12, 1
br i1 %16, label %22, label %.preheader94
.preheader94.loopexit: ; preds = %.lr.ph109
br label %.preheader94
.preheader94: ; preds = %.preheader94.loopexit, %22, %5
%17 = icmp slt i32 %14, %11
br i1 %17, label %.lr.ph106, label %._crit_edge
.lr.ph106: ; preds = %.preheader94
%18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0
%19 = load float*, float** %18, align 8
%20 = and i32 %15, 31
%21 = icmp eq i32 %20, 0
br label %30
; <label>:22: ; preds = %5
%23 = mul nuw nsw i32 %14, %6
%24 = add nuw nsw i32 %23, %15
%25 = icmp slt i32 %24, %3
br i1 %25, label %.lr.ph109.preheader, label %.preheader94
.lr.ph109.preheader: ; preds = %22
br label %.lr.ph109
.lr.ph109: ; preds = %.lr.ph109.preheader, %.lr.ph109
%.081107 = phi i32 [ %28, %.lr.ph109 ], [ %24, %.lr.ph109.preheader ]
%26 = sext i32 %.081107 to i64
%27 = getelementptr inbounds float, float* %4, i64 %26
store float 0.000000e+00, float* %27, align 4
%28 = add nsw i32 %.081107, %13
%29 = icmp slt i32 %28, %3
br i1 %29, label %.lr.ph109, label %.preheader94.loopexit
._crit_edge.loopexit: ; preds = %177
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %.preheader94
ret void
; <label>:30: ; preds = %.lr.ph106, %177
%.083105 = phi i32 [ %14, %.lr.ph106 ], [ %178, %177 ]
%31 = sdiv i32 %.083105, %10
%32 = icmp slt i32 %31, %3
br i1 %32, label %33, label %177
; <label>:33: ; preds = %30
%34 = srem i32 %.083105, %10
%35 = mul i32 %7, %34
%36 = add i32 %35, %15
%37 = mul nsw i32 %31, %2
%38 = add i32 %36, %37
br label %39
; <label>:39: ; preds = %33, %.preheader.preheader
%.086100 = phi i32 [ 0, %33 ], [ %40, %.preheader.preheader ]
%.09299 = phi float [ 0.000000e+00, %33 ], [ %155, %.preheader.preheader ]
%40 = add nuw nsw i32 %.086100, 16
%41 = or i32 %.086100, 15
%42 = mul i32 %41, %6
%43 = add i32 %42, %36
%44 = icmp slt i32 %43, %2
%45 = mul i32 %.086100, %6
br i1 %44, label %.preheader.preheader, label %157
.preheader.preheader: ; preds = %39
%46 = add i32 %38, %45
%47 = sext i32 %46 to i64
%48 = getelementptr inbounds float, float* %19, i64 %47
%49 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %48, i32 4) #8
%50 = fadd float %.09299, %49
%51 = or i32 %.086100, 1
%52 = mul i32 %51, %6
%53 = add i32 %38, %52
%54 = sext i32 %53 to i64
%55 = getelementptr inbounds float, float* %19, i64 %54
%56 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %55, i32 4) #8
%57 = fadd float %50, %56
%58 = or i32 %.086100, 2
%59 = mul i32 %58, %6
%60 = add i32 %38, %59
%61 = sext i32 %60 to i64
%62 = getelementptr inbounds float, float* %19, i64 %61
%63 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %62, i32 4) #8
%64 = fadd float %57, %63
%65 = or i32 %.086100, 3
%66 = mul i32 %65, %6
%67 = add i32 %38, %66
%68 = sext i32 %67 to i64
%69 = getelementptr inbounds float, float* %19, i64 %68
%70 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %69, i32 4) #8
%71 = fadd float %64, %70
%72 = or i32 %.086100, 4
%73 = mul i32 %72, %6
%74 = add i32 %38, %73
%75 = sext i32 %74 to i64
%76 = getelementptr inbounds float, float* %19, i64 %75
%77 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %76, i32 4) #8
%78 = fadd float %71, %77
%79 = or i32 %.086100, 5
%80 = mul i32 %79, %6
%81 = add i32 %38, %80
%82 = sext i32 %81 to i64
%83 = getelementptr inbounds float, float* %19, i64 %82
%84 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %83, i32 4) #8
%85 = fadd float %78, %84
%86 = or i32 %.086100, 6
%87 = mul i32 %86, %6
%88 = add i32 %38, %87
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %19, i64 %89
%91 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %90, i32 4) #8
%92 = fadd float %85, %91
%93 = or i32 %.086100, 7
%94 = mul i32 %93, %6
%95 = add i32 %38, %94
%96 = sext i32 %95 to i64
%97 = getelementptr inbounds float, float* %19, i64 %96
%98 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %97, i32 4) #8
%99 = fadd float %92, %98
%100 = or i32 %.086100, 8
%101 = mul i32 %100, %6
%102 = add i32 %38, %101
%103 = sext i32 %102 to i64
%104 = getelementptr inbounds float, float* %19, i64 %103
%105 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %104, i32 4) #8
%106 = fadd float %99, %105
%107 = or i32 %.086100, 9
%108 = mul i32 %107, %6
%109 = add i32 %38, %108
%110 = sext i32 %109 to i64
%111 = getelementptr inbounds float, float* %19, i64 %110
%112 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %111, i32 4) #8
%113 = fadd float %106, %112
%114 = or i32 %.086100, 10
%115 = mul i32 %114, %6
%116 = add i32 %38, %115
%117 = sext i32 %116 to i64
%118 = getelementptr inbounds float, float* %19, i64 %117
%119 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %118, i32 4) #8
%120 = fadd float %113, %119
%121 = or i32 %.086100, 11
%122 = mul i32 %121, %6
%123 = add i32 %38, %122
%124 = sext i32 %123 to i64
%125 = getelementptr inbounds float, float* %19, i64 %124
%126 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %125, i32 4) #8
%127 = fadd float %120, %126
%128 = or i32 %.086100, 12
%129 = mul i32 %128, %6
%130 = add i32 %38, %129
%131 = sext i32 %130 to i64
%132 = getelementptr inbounds float, float* %19, i64 %131
%133 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %132, i32 4) #8
%134 = fadd float %127, %133
%135 = or i32 %.086100, 13
%136 = mul i32 %135, %6
%137 = add i32 %38, %136
%138 = sext i32 %137 to i64
%139 = getelementptr inbounds float, float* %19, i64 %138
%140 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %139, i32 4) #8
%141 = fadd float %134, %140
%142 = or i32 %.086100, 14
%143 = mul i32 %142, %6
%144 = add i32 %38, %143
%145 = sext i32 %144 to i64
%146 = getelementptr inbounds float, float* %19, i64 %145
%147 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %146, i32 4) #8
%148 = fadd float %141, %147
%149 = or i32 %.086100, 15
%150 = mul i32 %149, %6
%151 = add i32 %38, %150
%152 = sext i32 %151 to i64
%153 = getelementptr inbounds float, float* %19, i64 %152
%154 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %153, i32 4) #8
%155 = fadd float %148, %154
%156 = icmp slt i32 %40, 128
br i1 %156, label %39, label %.critedge.loopexit125
; <label>:157: ; preds = %39
%.lcssa = phi i32 [ %45, %39 ]
%.09299.lcssa = phi float [ %.09299, %39 ]
%158 = add i32 %.lcssa, %36
%159 = icmp slt i32 %158, %2
br i1 %159, label %.lr.ph.preheader, label %.critedge
.lr.ph.preheader: ; preds = %157
br label %.lr.ph
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.084102 = phi i32 [ %165, %.lr.ph ], [ %158, %.lr.ph.preheader ]
%.1101 = phi float [ %164, %.lr.ph ], [ %.09299.lcssa, %.lr.ph.preheader ]
%160 = add nsw i32 %.084102, %37
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %19, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %.1101, %163
%165 = add i32 %.084102, %6
%166 = icmp slt i32 %165, %2
br i1 %166, label %.lr.ph, label %.critedge.loopexit
.critedge.loopexit: ; preds = %.lr.ph
%.lcssa134 = phi float [ %164, %.lr.ph ]
br label %.critedge
.critedge.loopexit125: ; preds = %.preheader.preheader
%.lcssa133 = phi float [ %155, %.preheader.preheader ]
br label %.critedge
.critedge: ; preds = %.critedge.loopexit125, %.critedge.loopexit, %157
%.3 = phi float [ %.09299.lcssa, %157 ], [ %.lcssa134, %.critedge.loopexit ], [ %.lcssa133, %.critedge.loopexit125 ]
tail call void @llvm.cuda.syncthreads()
br label %168
; <label>:167: ; preds = %168
%.lcssa135 = phi float [ %170, %168 ]
br i1 %21, label %173, label %177
; <label>:168: ; preds = %.critedge, %168
%.0104 = phi i32 [ 16, %.critedge ], [ %171, %168 ]
%.4103 = phi float [ %.3, %.critedge ], [ %170, %168 ]
%169 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.4103, i32 %.0104, i32 31) #3, !srcloc !53
%170 = fadd float %.4103, %169
%171 = lshr i32 %.0104, 1
%172 = icmp eq i32 %171, 0
br i1 %172, label %167, label %168
; <label>:173: ; preds = %167
%174 = sext i32 %31 to i64
%175 = getelementptr inbounds float, float* %4, i64 %174
%176 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %175, float %.lcssa135) #8
br label %177
; <label>:177: ; preds = %167, %173, %30
tail call void @llvm.cuda.syncthreads()
%178 = add i32 %.083105, %12
%179 = icmp slt i32 %178, %11
br i1 %179, label %30, label %._crit_edge.loopexit
}
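; OuterReductionKernel<NumPerThread = 16>: zero-fills the %3 output slots when
; gridDim.x == 1, then each thread serially __ldg-sums a strip of up to 16
; coefficients along the outer dimension and atomicAdds the partial result into
; its output element.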
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%8 = mul nuw nsw i32 %7, %6
%9 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%10 = mul nuw nsw i32 %9, %6
%11 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%12 = add nuw nsw i32 %10, %11
%13 = icmp eq i32 %7, 1
br i1 %13, label %.preheader, label %19
.preheader: ; preds = %5
%14 = icmp slt i32 %12, %3
br i1 %14, label %.lr.ph60.preheader, label %._crit_edge61
.lr.ph60.preheader: ; preds = %.preheader
br label %.lr.ph60
._crit_edge61.loopexit: ; preds = %.lr.ph60
br label %._crit_edge61
._crit_edge61: ; preds = %._crit_edge61.loopexit, %.preheader
tail call void @llvm.cuda.syncthreads()
br label %19
.lr.ph60: ; preds = %.lr.ph60.preheader, %.lr.ph60
%.059 = phi i32 [ %17, %.lr.ph60 ], [ %12, %.lr.ph60.preheader ]
%15 = sext i32 %.059 to i64
%16 = getelementptr inbounds float, float* %4, i64 %15
store float 0.000000e+00, float* %16, align 4
%17 = add nsw i32 %.059, %8
%18 = icmp slt i32 %17, %3
br i1 %18, label %.lr.ph60, label %._crit_edge61.loopexit
; <label>:19: ; preds = %._crit_edge61, %5
%20 = add i32 %2, 15
%21 = sdiv i32 %20, 16
%22 = mul nsw i32 %21, %3
%23 = icmp slt i32 %12, %22
br i1 %23, label %.lr.ph57, label %._crit_edge58
.lr.ph57: ; preds = %19
%24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0
%25 = load float*, float** %24, align 8
br label %26
._crit_edge58.loopexit: ; preds = %._crit_edge
br label %._crit_edge58
._crit_edge58: ; preds = %._crit_edge58.loopexit, %19
ret void
; <label>:26: ; preds = %.lr.ph57, %._crit_edge
%.04755 = phi i32 [ %12, %.lr.ph57 ], [ %36, %._crit_edge ]
%27 = srem i32 %.04755, %3
%28 = sdiv i32 %.04755, %3
%29 = shl nsw i32 %28, 4
%30 = add nsw i32 %29, 16
%31 = icmp sgt i32 %30, %2
%..i = select i1 %31, i32 %2, i32 %30
%32 = icmp slt i32 %29, %..i
br i1 %32, label %.lr.ph.preheader, label %._crit_edge
.lr.ph.preheader: ; preds = %26
br label %.lr.ph
._crit_edge.loopexit: ; preds = %.lr.ph
%.lcssa = phi float [ %43, %.lr.ph ]
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %26
%.052.lcssa = phi float [ 0.000000e+00, %26 ], [ %.lcssa, %._crit_edge.loopexit ]
%33 = sext i32 %27 to i64
%34 = getelementptr inbounds float, float* %4, i64 %33
%35 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %34, float %.052.lcssa) #8
%36 = add nsw i32 %.04755, %8
%37 = icmp slt i32 %36, %22
br i1 %37, label %26, label %._crit_edge58.loopexit
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.04654 = phi i32 [ %44, %.lr.ph ], [ %29, %.lr.ph.preheader ]
%.05253 = phi float [ %43, %.lr.ph ], [ 0.000000e+00, %.lr.ph.preheader ]
%38 = mul nsw i32 %.04654, %3
%39 = add nsw i32 %38, %27
%40 = sext i32 %39 to i64
%41 = getelementptr inbounds float, float* %25, i64 %40
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
%43 = fadd float %.05253, %42
%44 = add nsw i32 %.04654, 1
%45 = icmp slt i32 %44, %..i
br i1 %45, label %.lr.ph, label %._crit_edge.loopexit
}
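; EigenMetaKernel over TensorEvalToOp<TensorReductionOp<SumReducer, ...>>:
; grid-stride loop in which each thread evaluates one reduced output coefficient,
; summing along the reduction dimension via __ldg (unrolled x4 plus an epilogue
; loop) and storing the result to the eval-to buffer; an empty reduction extent
; stores 0.0 instead.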
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.5"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.444.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 7
%.sroa.444.0..sroa_cast = bitcast i32* %.sroa.444.0..sroa_idx to i64*
%.sroa.444.0.copyload = load i64, i64* %.sroa.444.0..sroa_cast, align 8
%.sroa.546.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 9, i32 0, i64 0
%.sroa.546.0.copyload = load i32, i32* %.sroa.546.0..sroa_idx, align 8
%.sroa.750.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 10, i32 0
%.sroa.750.0.copyload = load float*, float** %.sroa.750.0..sroa_idx, align 8
%.sroa.9.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 2
%.sroa.9.0.copyload = load float*, float** %.sroa.9.0..sroa_idx, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
.lr.ph.i: ; preds = %2
%11 = trunc i64 %.sroa.444.0.copyload to i32
%12 = icmp sgt i32 %.sroa.546.0.copyload, 0
%13 = lshr i64 %.sroa.444.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i
br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i
%15 = add i32 %.sroa.546.0.copyload, -1
%xtraiter = and i32 %.sroa.546.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.546.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i
br label %18
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa66 = phi i32 [ %47, %18 ]
%.lcssa65 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; <label>:48: ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !56
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa67 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit, %2
ret void
}
; Function Attrs: nounwind
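; InitVector<PtrWrapper<float, int>>: fills the first %1 elements of the wrapped
; buffer with the constant %0 via a grid-stride loop.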
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_(float, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = add nuw nsw i32 %6, %7
%9 = icmp slt i32 %8, %1
br i1 %9, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %3
%10 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %2, i64 0, i32 0
%11 = load float*, float** %10, align 8
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %5
br label %14
._crit_edge.loopexit: ; preds = %14
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %3
ret void
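; block %14: grid-stride store body; the stride %13 is gridDim.x * blockDim.x.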
; <label>:14: ; preds = %.lr.ph, %14
%.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ]
%15 = sext i32 %.08 to i64
%16 = getelementptr inbounds float, float* %11, i64 %15
store float %0, float* %16, align 4
%17 = add i32 %13, %.08
%18 = icmp slt i32 %17, %1
br i1 %18, label %14, label %._crit_edge.loopexit
}
; Function Attrs: convergent nounwind
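; RowReduceKernel<32, 256, 128, ...> with CudaSumReducer: after the launch-shape
; asserts, each 256-thread block sums one 32768-column chunk of a row (128 strided
; ldg loads per thread), reduces each warp with shfl.down, and lane 0 of every warp
; atomically adds its partial sum to the output element for that row.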
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%38 = load float*, float** %37, align 8
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
br label %41
._crit_edge.loopexit: ; preds = %187
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:41: ; preds = %.lr.ph, %187
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ]
%42 = srem i32 %.0114, %31
%43 = sdiv i32 %.0114, %31
%44 = shl nsw i32 %42, 15
%45 = or i32 %44, %34
%46 = icmp slt i32 %43, %2
br i1 %46, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %164, %.preheader.preheader ]
br label %.thread.preheader
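; .thread.preheader: merges the per-thread partial sum from whichever bounds check
; failed first, then falls through to the warp-level reduction.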
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %41
%47 = mul nsw i32 %43, %3
%48 = add i32 %47, %45
br label %49
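; block %49: outer accumulation loop; up to 8 trips of the 16-way unrolled body below
; (128 elements per thread), exiting early to the bounds-checked tail in .preheader101.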
; <label>:49: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ]
%50 = add nuw nsw i32 %.098108, 16
%51 = shl i32 %.098108, 8
%52 = or i32 %51, 3840
%53 = add nsw i32 %52, %45
%54 = icmp slt i32 %53, %3
br i1 %54, label %.preheader.preheader, label %.preheader101
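; .preheader.preheader: fully unrolled step of 16 ldg loads at stride 256, each
; accumulated with fadd; the guard above checked the largest offset, so all 16 are in bounds.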
.preheader.preheader: ; preds = %49
%55 = add i32 %48, %51
%56 = sext i32 %55 to i64
%57 = getelementptr inbounds float, float* %40, i64 %56
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8
%59 = fadd float %.095109, %58
%60 = shl i32 %.098108, 8
%61 = or i32 %60, 256
%62 = add i32 %48, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, float* %40, i64 %63
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8
%66 = fadd float %59, %65
%67 = shl i32 %.098108, 8
%68 = or i32 %67, 512
%69 = add i32 %48, %68
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %40, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %66, %72
%74 = shl i32 %.098108, 8
%75 = or i32 %74, 768
%76 = add i32 %48, %75
%77 = sext i32 %76 to i64
%78 = getelementptr inbounds float, float* %40, i64 %77
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8
%80 = fadd float %73, %79
%81 = shl i32 %.098108, 8
%82 = or i32 %81, 1024
%83 = add i32 %48, %82
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %40, i64 %84
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8
%87 = fadd float %80, %86
%88 = shl i32 %.098108, 8
%89 = or i32 %88, 1280
%90 = add i32 %48, %89
%91 = sext i32 %90 to i64
%92 = getelementptr inbounds float, float* %40, i64 %91
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8
%94 = fadd float %87, %93
%95 = shl i32 %.098108, 8
%96 = or i32 %95, 1536
%97 = add i32 %48, %96
%98 = sext i32 %97 to i64
%99 = getelementptr inbounds float, float* %40, i64 %98
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8
%101 = fadd float %94, %100
%102 = shl i32 %.098108, 8
%103 = or i32 %102, 1792
%104 = add i32 %48, %103
%105 = sext i32 %104 to i64
%106 = getelementptr inbounds float, float* %40, i64 %105
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8
%108 = fadd float %101, %107
%109 = shl i32 %.098108, 8
%110 = or i32 %109, 2048
%111 = add i32 %48, %110
%112 = sext i32 %111 to i64
%113 = getelementptr inbounds float, float* %40, i64 %112
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8
%115 = fadd float %108, %114
%116 = shl i32 %.098108, 8
%117 = or i32 %116, 2304
%118 = add i32 %48, %117
%119 = sext i32 %118 to i64
%120 = getelementptr inbounds float, float* %40, i64 %119
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8
%122 = fadd float %115, %121
%123 = shl i32 %.098108, 8
%124 = or i32 %123, 2560
%125 = add i32 %48, %124
%126 = sext i32 %125 to i64
%127 = getelementptr inbounds float, float* %40, i64 %126
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8
%129 = fadd float %122, %128
%130 = shl i32 %.098108, 8
%131 = or i32 %130, 2816
%132 = add i32 %48, %131
%133 = sext i32 %132 to i64
%134 = getelementptr inbounds float, float* %40, i64 %133
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8
%136 = fadd float %129, %135
%137 = shl i32 %.098108, 8
%138 = or i32 %137, 3072
%139 = add i32 %48, %138
%140 = sext i32 %139 to i64
%141 = getelementptr inbounds float, float* %40, i64 %140
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8
%143 = fadd float %136, %142
%144 = shl i32 %.098108, 8
%145 = or i32 %144, 3328
%146 = add i32 %48, %145
%147 = sext i32 %146 to i64
%148 = getelementptr inbounds float, float* %40, i64 %147
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8
%150 = fadd float %143, %149
%151 = shl i32 %.098108, 8
%152 = or i32 %151, 3584
%153 = add i32 %48, %152
%154 = sext i32 %153 to i64
%155 = getelementptr inbounds float, float* %40, i64 %154
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8
%157 = fadd float %150, %156
%158 = shl i32 %.098108, 8
%159 = or i32 %158, 3840
%160 = add i32 %48, %159
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %40, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %157, %163
%165 = icmp slt i32 %50, 128
br i1 %165, label %49, label %.thread.preheader.loopexit
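; .preheader101: tail of the unrolled loop; the remaining strided loads are each
; individually bounds-checked against %3 before being accumulated.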
.preheader101: ; preds = %49
%.lcssa = phi i32 [ %51, %49 ]
%.098108.lcssa = phi i32 [ %.098108, %49 ]
%.095109.lcssa = phi float [ %.095109, %49 ]
%166 = add nsw i32 %.lcssa, %45
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %47
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %40, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = fadd float %.095109.lcssa, %172
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %45
%177 = icmp slt i32 %176, %3
br i1 %177, label %190, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %46, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %187, label %183
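; .thread: warp-level reduction; shfl.down.b32 with offsets 16, 8, 4, 2, 1 folds the
; 32 lane partials into lane 0.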
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = fadd float %.8112, %179
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !58
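; block %183: reached only by lane 0 of a warp handling an in-range row (%brmerge
; otherwise skips ahead); atomically adds the warp total to the output element.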
; <label>:183: ; preds = %178
%184 = sext i32 %43 to i64
%185 = getelementptr inbounds float, float* %38, i64 %184
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8
br label %187
; <label>:187: ; preds = %178, %183
%188 = add nuw nsw i32 %.0114, 32
%189 = icmp slt i32 %188, %32
br i1 %189, label %41, label %._crit_edge.loopexit
; <label>:190: ; preds = %168
%191 = add nsw i32 %176, %47
%192 = sext i32 %191 to i64
%193 = getelementptr inbounds float, float* %40, i64 %192
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8
%195 = fadd float %173, %194
%196 = shl i32 %.098108.lcssa, 8
%197 = or i32 %196, 512
%198 = add nsw i32 %197, %45
%199 = icmp slt i32 %198, %3
br i1 %199, label %200, label %.thread.preheader
; <label>:200: ; preds = %190
%201 = add nsw i32 %198, %47
%202 = sext i32 %201 to i64
%203 = getelementptr inbounds float, float* %40, i64 %202
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8
%205 = fadd float %195, %204
%206 = shl i32 %.098108.lcssa, 8
%207 = or i32 %206, 768
%208 = add nsw i32 %207, %45
%209 = icmp slt i32 %208, %3
br i1 %209, label %210, label %.thread.preheader
; <label>:210: ; preds = %200
%211 = add nsw i32 %208, %47
%212 = sext i32 %211 to i64
%213 = getelementptr inbounds float, float* %40, i64 %212
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8
%215 = fadd float %205, %214
%216 = shl i32 %.098108.lcssa, 8
%217 = or i32 %216, 1024
%218 = add nsw i32 %217, %45
%219 = icmp slt i32 %218, %3
br i1 %219, label %220, label %.thread.preheader
; <label>:220: ; preds = %210
%221 = add nsw i32 %218, %47
%222 = sext i32 %221 to i64
%223 = getelementptr inbounds float, float* %40, i64 %222
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8
%225 = fadd float %215, %224
%226 = shl i32 %.098108.lcssa, 8
%227 = or i32 %226, 1280
%228 = add nsw i32 %227, %45
%229 = icmp slt i32 %228, %3
br i1 %229, label %230, label %.thread.preheader
; <label>:230: ; preds = %220
%231 = add nsw i32 %228, %47
%232 = sext i32 %231 to i64
%233 = getelementptr inbounds float, float* %40, i64 %232
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8
%235 = fadd float %225, %234
%236 = shl i32 %.098108.lcssa, 8
%237 = or i32 %236, 1536
%238 = add nsw i32 %237, %45
%239 = icmp slt i32 %238, %3
br i1 %239, label %240, label %.thread.preheader
; <label>:240: ; preds = %230
%241 = add nsw i32 %238, %47
%242 = sext i32 %241 to i64
%243 = getelementptr inbounds float, float* %40, i64 %242
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8
%245 = fadd float %235, %244
%246 = shl i32 %.098108.lcssa, 8
%247 = or i32 %246, 1792
%248 = add nsw i32 %247, %45
%249 = icmp slt i32 %248, %3
br i1 %249, label %250, label %.thread.preheader
; <label>:250: ; preds = %240
%251 = add nsw i32 %248, %47
%252 = sext i32 %251 to i64
%253 = getelementptr inbounds float, float* %40, i64 %252
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8
%255 = fadd float %245, %254
%256 = shl i32 %.098108.lcssa, 8
%257 = or i32 %256, 2048
%258 = add nsw i32 %257, %45
%259 = icmp slt i32 %258, %3
br i1 %259, label %260, label %.thread.preheader
; <label>:260: ; preds = %250
%261 = add nsw i32 %258, %47
%262 = sext i32 %261 to i64
%263 = getelementptr inbounds float, float* %40, i64 %262
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8
%265 = fadd float %255, %264
%266 = shl i32 %.098108.lcssa, 8
%267 = or i32 %266, 2304
%268 = add nsw i32 %267, %45
%269 = icmp slt i32 %268, %3
br i1 %269, label %270, label %.thread.preheader
; <label>:270: ; preds = %260
%271 = add nsw i32 %268, %47
%272 = sext i32 %271 to i64
%273 = getelementptr inbounds float, float* %40, i64 %272
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8
%275 = fadd float %265, %274
%276 = shl i32 %.098108.lcssa, 8
%277 = or i32 %276, 2560
%278 = add nsw i32 %277, %45
%279 = icmp slt i32 %278, %3
br i1 %279, label %280, label %.thread.preheader
; <label>:280: ; preds = %270
%281 = add nsw i32 %278, %47
%282 = sext i32 %281 to i64
%283 = getelementptr inbounds float, float* %40, i64 %282
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8
%285 = fadd float %275, %284
%286 = shl i32 %.098108.lcssa, 8
%287 = or i32 %286, 2816
%288 = add nsw i32 %287, %45
%289 = icmp slt i32 %288, %3
br i1 %289, label %290, label %.thread.preheader
; <label>:290: ; preds = %280
%291 = add nsw i32 %288, %47
%292 = sext i32 %291 to i64
%293 = getelementptr inbounds float, float* %40, i64 %292
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8
%295 = fadd float %285, %294
%296 = shl i32 %.098108.lcssa, 8
%297 = or i32 %296, 3072
%298 = add nsw i32 %297, %45
%299 = icmp slt i32 %298, %3
br i1 %299, label %300, label %.thread.preheader
; <label>:300: ; preds = %290
%301 = add nsw i32 %298, %47
%302 = sext i32 %301 to i64
%303 = getelementptr inbounds float, float* %40, i64 %302
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8
%305 = fadd float %295, %304
%306 = shl i32 %.098108.lcssa, 8
%307 = or i32 %306, 3328
%308 = add nsw i32 %307, %45
%309 = icmp slt i32 %308, %3
br i1 %309, label %310, label %.thread.preheader
; <label>:310: ; preds = %300
%311 = add nsw i32 %308, %47
%312 = sext i32 %311 to i64
%313 = getelementptr inbounds float, float* %40, i64 %312
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8
%315 = fadd float %305, %314
%316 = shl i32 %.098108.lcssa, 8
%317 = or i32 %316, 3584
%318 = add nsw i32 %317, %45
%319 = icmp slt i32 %318, %3
br i1 %319, label %320, label %.thread.preheader
; <label>:320: ; preds = %310
%321 = add nsw i32 %318, %47
%322 = sext i32 %321 to i64
%323 = getelementptr inbounds float, float* %40, i64 %322
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8
%325 = fadd float %315, %324
br label %.thread.preheader
}
; Function Attrs: convergent inlinehint noreturn nounwind
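; Device-side assert helper: forwards the failing expression, line number, and
; function name to CUDA's __assertfail; never returns.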
define internal fastcc void @_ZL13__assert_failPKcS0_jS0_(i8*, i32, i8*) unnamed_addr #6 {
tail call void @__assertfail(i8* %0, i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i64 0, i64 0), i32 %1, i8* %2, i64 1) #10
unreachable
}
; Function Attrs: convergent noreturn
declare void @__assertfail(i8*, i8*, i32, i8*, i64) #7
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.y() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.ntid.z() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.y() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.ptx.read.nctaid.z() #1
; Function Attrs: convergent nounwind
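; RowReduceKernel<32, 256, 128, ...> with CudaMaxReducer: same shape as the sum
; variant above, but the accumulator is seeded from the reducer's stored value,
; elements combine through llvm.nvvm.fmax.f, and the final write uses a cmpxchg
; loop instead of an atomic add.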
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
br label %39
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ]
%40 = srem i32 %.0114, %31
%41 = sdiv i32 %.0114, %31
%42 = shl nsw i32 %40, 15
%43 = or i32 %42, %34
%.idx.val = load float, float* %.idx, align 4
%44 = icmp slt i32 %41, %2
br i1 %44, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %163, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %39
%45 = mul nsw i32 %41, %3
%46 = add i32 %45, %43
%47 = load float*, float** %38, align 8
br label %48
; <label>:48: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ]
%49 = add nuw nsw i32 %.098108, 16
%50 = shl i32 %.098108, 8
%51 = or i32 %50, 3840
%52 = add nsw i32 %51, %43
%53 = icmp slt i32 %52, %3
br i1 %53, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %48
%54 = add i32 %46, %50
%55 = sext i32 %54 to i64
%56 = getelementptr inbounds float, float* %47, i64 %55
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8
%59 = shl i32 %.098108, 8
%60 = or i32 %59, 256
%61 = add i32 %46, %60
%62 = sext i32 %61 to i64
%63 = getelementptr inbounds float, float* %47, i64 %62
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8
%66 = shl i32 %.098108, 8
%67 = or i32 %66, 512
%68 = add i32 %46, %67
%69 = sext i32 %68 to i64
%70 = getelementptr inbounds float, float* %47, i64 %69
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8
%73 = shl i32 %.098108, 8
%74 = or i32 %73, 768
%75 = add i32 %46, %74
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %47, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8
%80 = shl i32 %.098108, 8
%81 = or i32 %80, 1024
%82 = add i32 %46, %81
%83 = sext i32 %82 to i64
%84 = getelementptr inbounds float, float* %47, i64 %83
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8
%87 = shl i32 %.098108, 8
%88 = or i32 %87, 1280
%89 = add i32 %46, %88
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds float, float* %47, i64 %90
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8
%94 = shl i32 %.098108, 8
%95 = or i32 %94, 1536
%96 = add i32 %46, %95
%97 = sext i32 %96 to i64
%98 = getelementptr inbounds float, float* %47, i64 %97
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8
%101 = shl i32 %.098108, 8
%102 = or i32 %101, 1792
%103 = add i32 %46, %102
%104 = sext i32 %103 to i64
%105 = getelementptr inbounds float, float* %47, i64 %104
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8
%108 = shl i32 %.098108, 8
%109 = or i32 %108, 2048
%110 = add i32 %46, %109
%111 = sext i32 %110 to i64
%112 = getelementptr inbounds float, float* %47, i64 %111
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8
%115 = shl i32 %.098108, 8
%116 = or i32 %115, 2304
%117 = add i32 %46, %116
%118 = sext i32 %117 to i64
%119 = getelementptr inbounds float, float* %47, i64 %118
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8
%122 = shl i32 %.098108, 8
%123 = or i32 %122, 2560
%124 = add i32 %46, %123
%125 = sext i32 %124 to i64
%126 = getelementptr inbounds float, float* %47, i64 %125
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8
%129 = shl i32 %.098108, 8
%130 = or i32 %129, 2816
%131 = add i32 %46, %130
%132 = sext i32 %131 to i64
%133 = getelementptr inbounds float, float* %47, i64 %132
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8
%136 = shl i32 %.098108, 8
%137 = or i32 %136, 3072
%138 = add i32 %46, %137
%139 = sext i32 %138 to i64
%140 = getelementptr inbounds float, float* %47, i64 %139
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8
%143 = shl i32 %.098108, 8
%144 = or i32 %143, 3328
%145 = add i32 %46, %144
%146 = sext i32 %145 to i64
%147 = getelementptr inbounds float, float* %47, i64 %146
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8
%150 = shl i32 %.098108, 8
%151 = or i32 %150, 3584
%152 = add i32 %46, %151
%153 = sext i32 %152 to i64
%154 = getelementptr inbounds float, float* %47, i64 %153
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8
%157 = shl i32 %.098108, 8
%158 = or i32 %157, 3840
%159 = add i32 %46, %158
%160 = sext i32 %159 to i64
%161 = getelementptr inbounds float, float* %47, i64 %160
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8
%164 = icmp slt i32 %49, 128
br i1 %164, label %48, label %.thread.preheader.loopexit
.preheader101: ; preds = %48
%.lcssa = phi i32 [ %50, %48 ]
%.098108.lcssa = phi i32 [ %.098108, %48 ]
%.095109.lcssa = phi float [ %.095109, %48 ]
%165 = load float*, float** %38, align 8
%166 = add nsw i32 %.lcssa, %43
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %45
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %165, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %43
%177 = icmp slt i32 %176, %3
br i1 %177, label %198, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %44, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !59
; <label>:183: ; preds = %178
%184 = load float*, float** %37, align 8
%185 = sext i32 %41 to i64
%186 = getelementptr inbounds float, float* %184, i64 %185
%187 = bitcast float %.lcssa138 to i32
%188 = bitcast float* %186 to i32*
%189 = load i32, i32* %188, align 4
br label %190
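; blocks %190/%193: atomic float max emulated with an i32 cmpxchg; retry until the
; stored value is no longer smaller than the warp maximum.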
; <label>:190: ; preds = %193, %183
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ]
%191 = bitcast i32 %.011.i to float
%192 = fcmp olt float %191, %.lcssa138
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit
; <label>:193: ; preds = %190
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst
%195 = extractvalue { i32, i1 } %194, 0
%not..i = icmp eq i32 %.011.i, %195
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178
%196 = add nuw nsw i32 %.0114, 32
%197 = icmp slt i32 %196, %32
br i1 %197, label %39, label %._crit_edge.loopexit
; <label>:198: ; preds = %168
%199 = add nsw i32 %176, %45
%200 = sext i32 %199 to i64
%201 = getelementptr inbounds float, float* %165, i64 %200
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8
%204 = shl i32 %.098108.lcssa, 8
%205 = or i32 %204, 512
%206 = add nsw i32 %205, %43
%207 = icmp slt i32 %206, %3
br i1 %207, label %208, label %.thread.preheader
; <label>:208: ; preds = %198
%209 = add nsw i32 %206, %45
%210 = sext i32 %209 to i64
%211 = getelementptr inbounds float, float* %165, i64 %210
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8
%214 = shl i32 %.098108.lcssa, 8
%215 = or i32 %214, 768
%216 = add nsw i32 %215, %43
%217 = icmp slt i32 %216, %3
br i1 %217, label %218, label %.thread.preheader
; <label>:218: ; preds = %208
%219 = add nsw i32 %216, %45
%220 = sext i32 %219 to i64
%221 = getelementptr inbounds float, float* %165, i64 %220
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8
%224 = shl i32 %.098108.lcssa, 8
%225 = or i32 %224, 1024
%226 = add nsw i32 %225, %43
%227 = icmp slt i32 %226, %3
br i1 %227, label %228, label %.thread.preheader
; <label>:228: ; preds = %218
%229 = add nsw i32 %226, %45
%230 = sext i32 %229 to i64
%231 = getelementptr inbounds float, float* %165, i64 %230
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8
%234 = shl i32 %.098108.lcssa, 8
%235 = or i32 %234, 1280
%236 = add nsw i32 %235, %43
%237 = icmp slt i32 %236, %3
br i1 %237, label %238, label %.thread.preheader
; <label>:238: ; preds = %228
%239 = add nsw i32 %236, %45
%240 = sext i32 %239 to i64
%241 = getelementptr inbounds float, float* %165, i64 %240
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8
%244 = shl i32 %.098108.lcssa, 8
%245 = or i32 %244, 1536
%246 = add nsw i32 %245, %43
%247 = icmp slt i32 %246, %3
br i1 %247, label %248, label %.thread.preheader
; <label>:248: ; preds = %238
%249 = add nsw i32 %246, %45
%250 = sext i32 %249 to i64
%251 = getelementptr inbounds float, float* %165, i64 %250
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8
%254 = shl i32 %.098108.lcssa, 8
%255 = or i32 %254, 1792
%256 = add nsw i32 %255, %43
%257 = icmp slt i32 %256, %3
br i1 %257, label %258, label %.thread.preheader
; <label>:258: ; preds = %248
%259 = add nsw i32 %256, %45
%260 = sext i32 %259 to i64
%261 = getelementptr inbounds float, float* %165, i64 %260
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8
%264 = shl i32 %.098108.lcssa, 8
%265 = or i32 %264, 2048
%266 = add nsw i32 %265, %43
%267 = icmp slt i32 %266, %3
br i1 %267, label %268, label %.thread.preheader
; <label>:268: ; preds = %258
%269 = add nsw i32 %266, %45
%270 = sext i32 %269 to i64
%271 = getelementptr inbounds float, float* %165, i64 %270
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8
%274 = shl i32 %.098108.lcssa, 8
%275 = or i32 %274, 2304
%276 = add nsw i32 %275, %43
%277 = icmp slt i32 %276, %3
br i1 %277, label %278, label %.thread.preheader
; <label>:278: ; preds = %268
%279 = add nsw i32 %276, %45
%280 = sext i32 %279 to i64
%281 = getelementptr inbounds float, float* %165, i64 %280
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8
%284 = shl i32 %.098108.lcssa, 8
%285 = or i32 %284, 2560
%286 = add nsw i32 %285, %43
%287 = icmp slt i32 %286, %3
br i1 %287, label %288, label %.thread.preheader
; <label>:288: ; preds = %278
%289 = add nsw i32 %286, %45
%290 = sext i32 %289 to i64
%291 = getelementptr inbounds float, float* %165, i64 %290
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8
%294 = shl i32 %.098108.lcssa, 8
%295 = or i32 %294, 2816
%296 = add nsw i32 %295, %43
%297 = icmp slt i32 %296, %3
br i1 %297, label %298, label %.thread.preheader
; <label>:298: ; preds = %288
%299 = add nsw i32 %296, %45
%300 = sext i32 %299 to i64
%301 = getelementptr inbounds float, float* %165, i64 %300
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8
%304 = shl i32 %.098108.lcssa, 8
%305 = or i32 %304, 3072
%306 = add nsw i32 %305, %43
%307 = icmp slt i32 %306, %3
br i1 %307, label %308, label %.thread.preheader
; <label>:308: ; preds = %298
%309 = add nsw i32 %306, %45
%310 = sext i32 %309 to i64
%311 = getelementptr inbounds float, float* %165, i64 %310
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8
%314 = shl i32 %.098108.lcssa, 8
%315 = or i32 %314, 3328
%316 = add nsw i32 %315, %43
%317 = icmp slt i32 %316, %3
br i1 %317, label %318, label %.thread.preheader
; <label>:318: ; preds = %308
%319 = add nsw i32 %316, %45
%320 = sext i32 %319 to i64
%321 = getelementptr inbounds float, float* %165, i64 %320
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8
%324 = shl i32 %.098108.lcssa, 8
%325 = or i32 %324, 3584
%326 = add nsw i32 %325, %43
%327 = icmp slt i32 %326, %3
br i1 %327, label %328, label %.thread.preheader
; <label>:328: ; preds = %318
%329 = add nsw i32 %326, %45
%330 = sext i32 %329 to i64
%331 = getelementptr inbounds float, float* %165, i64 %330
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8
br label %.thread.preheader
}
; Function Attrs: convergent nounwind
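; ColumnReduceKernel<128, 256, 16, ...> with CudaSumReducer: each thread owns a
; (column, 16-row tile) pair, ldg-loads up to 16 column entries (unrolled by 2),
; and atomically adds its tile sum to the per-column output.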
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
%41 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%42 = load float*, float** %41, align 8
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
%43 = add i32 %32, -1
%44 = sub i32 %43, %34
%45 = sub i32 %44, %35
%46 = lshr i32 %45, 15
%47 = add nuw nsw i32 %46, 1
%xtraiter = and i32 %47, 3
%48 = icmp ult i32 %45, 98304
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader
%unroll_iter = sub nsw i32 %47, %xtraiter
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ]
%49 = srem i32 %.047.us, %3
%50 = sdiv i32 %.047.us, %3
%51 = srem i32 %50, %31
%52 = shl nsw i32 %51, 4
br label %53
; <label>:53: ; preds = %104, %.lr.ph.split.us
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ]
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ]
%54 = add nuw nsw i32 %.04346.us.us, %52
%55 = icmp slt i32 %54, %2
br i1 %55, label %56, label %62
; <label>:56: ; preds = %53
%57 = mul nsw i32 %54, %3
%58 = add nsw i32 %57, %49
%59 = sext i32 %58 to i64
%60 = getelementptr inbounds float, float* %40, i64 %59
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8
br label %62
; <label>:62: ; preds = %56, %53
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ]
%64 = fadd float %.04445.us.us, %63
%65 = or i32 %.04346.us.us, 1
%66 = add nuw nsw i32 %65, %52
%67 = icmp slt i32 %66, %2
br i1 %67, label %98, label %104
.us-lcssa.us.us: ; preds = %104
%.lcssa = phi float [ %106, %104 ]
%68 = sext i32 %49 to i64
%69 = getelementptr inbounds float, float* %42, i64 %68
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8
%71 = add nuw nsw i32 %.047.us, 32768
%72 = icmp slt i32 %71, %32
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us
br label %._crit_edge
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ]
br label %._crit_edge.loopexit59.unr-lcssa
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa
br label %.lr.ph.split.epil
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ]
%73 = srem i32 %.047.epil, %3
%74 = sext i32 %73 to i64
%75 = getelementptr inbounds float, float* %42, i64 %74
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8
%77 = add nuw nsw i32 %.047.epil, 32768
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !60
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil
br label %._crit_edge.loopexit59
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29
ret void
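; .lr.ph.split (and its epilogue): path taken when %3 (the column extent) is
; negative, presumably unreachable in practice; the accumulation has folded away,
; leaving only 4x-unrolled atomic adds of 0.0.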
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ]
%78 = srem i32 %.047, %3
%79 = sext i32 %78 to i64
%80 = getelementptr inbounds float, float* %42, i64 %79
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8
%82 = add nuw nsw i32 %.047, 32768
%83 = srem i32 %82, %3
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %42, i64 %84
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8
%87 = add nsw i32 %.047, 65536
%88 = srem i32 %87, %3
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %42, i64 %89
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8
%92 = add nsw i32 %.047, 98304
%93 = srem i32 %92, %3
%94 = sext i32 %93 to i64
%95 = getelementptr inbounds float, float* %42, i64 %94
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8
%97 = add nsw i32 %.047, 131072
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split
; <label>:98: ; preds = %62
%99 = mul nsw i32 %66, %3
%100 = add nsw i32 %99, %49
%101 = sext i32 %100 to i64
%102 = getelementptr inbounds float, float* %40, i64 %101
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8
br label %104
; <label>:104: ; preds = %98, %62
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ]
%106 = fadd float %64, %105
%107 = add nsw i32 %.04346.us.us, 2
%exitcond.1 = icmp eq i32 %107, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53
}
; Function Attrs: convergent nounwind
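; ColumnReduceKernel<128, 256, 16, ...> with CudaMaxReducer: as above, but seeded
; from the reducer's stored value, combined with llvm.nvvm.fmax.f, and finished
; with the cmpxchg-based atomic float max.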
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
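; Loop preheader: hoists the reducer's stored initial value (%.idx45) and the
; input/output base-pointer slots, then unswitches the outer loop on %3 >= 0
; (the ".us" clone below is the version that actually loads input data).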
.lr.ph: ; preds = %29
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ]
%41 = srem i32 %.048.us, %3
%42 = sdiv i32 %.048.us, %3
%43 = srem i32 %42, %31
%44 = shl nsw i32 %43, 4
%.idx45.val.us = load float, float* %.idx45, align 4
%45 = load float*, float** %39, align 8
br label %54
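; Blocks 46/49 are the inlined CudaMaxReducer::atomic_reduce: the float output
; word is reinterpreted as i32 and a cmpxchg loop retries until either the CAS
; succeeds or the stored value is no longer below the candidate maximum.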
; <label>:46: ; preds = %49, %.us-lcssa.us.us
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ]
%47 = bitcast i32 %.011.i.us to float
%48 = fcmp olt float %47, %.lcssa
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
; <label>:49: ; preds = %46
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst
%51 = extractvalue { i32, i1 } %50, 0
%not..i.us = icmp eq i32 %.011.i.us, %51
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46
%52 = add nuw nsw i32 %.048.us, 32768
%53 = icmp slt i32 %52, %32
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit
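; Inner tile loop (block 54): walks the 16 rows of this tile two at a time
; (unrolled x2); in-range elements are loaded through ld.global.nc
; (llvm.nvvm.ldg) and folded with fmax, while out-of-range lanes substitute
; the reducer's initial value.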
; <label>:54: ; preds = %112, %.lr.ph.split.us
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ]
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ]
%55 = add nuw nsw i32 %.04347.us.us, %44
%56 = icmp slt i32 %55, %2
br i1 %56, label %57, label %63
; <label>:57: ; preds = %54
%58 = mul nsw i32 %55, %3
%59 = add nsw i32 %58, %41
%60 = sext i32 %59 to i64
%61 = getelementptr inbounds float, float* %45, i64 %60
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8
br label %63
; <label>:63: ; preds = %54, %57
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ]
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8
%66 = or i32 %.04347.us.us, 1
%67 = add nuw nsw i32 %66, %44
%68 = icmp slt i32 %67, %2
br i1 %68, label %106, label %112
.us-lcssa.us.us: ; preds = %112
%.lcssa = phi float [ %114, %112 ]
%69 = load float*, float** %40, align 8
%70 = sext i32 %41 to i64
%71 = getelementptr inbounds float, float* %69, i64 %70
%72 = bitcast float %.lcssa to i32
%73 = bitcast float* %71 to i32*
%74 = load i32, i32* %73, align 4
br label %46
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
br label %._crit_edge
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29
ret void
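; Degenerate clone of the outer loop (the %3 < 0 side of the unswitch): all
; guarded input loads were eliminated, so the 16 chained fmax calls merely
; combine the initial value with itself before it is atomically merged into
; the output.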
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ]
%.idx45.val = load float, float* %.idx45, align 4
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8
%91 = srem i32 %.048, %3
%92 = load float*, float** %40, align 8
%93 = sext i32 %91 to i64
%94 = getelementptr inbounds float, float* %92, i64 %93
%95 = bitcast float %90 to i32
%96 = bitcast float* %94 to i32*
%97 = load i32, i32* %96, align 4
br label %98
; <label>:98: ; preds = %101, %.lr.ph.split
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ]
%99 = bitcast i32 %.011.i to float
%100 = fcmp olt float %99, %90
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
; <label>:101: ; preds = %98
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst
%103 = extractvalue { i32, i1 } %102, 0
%not..i = icmp eq i32 %.011.i, %103
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101
%104 = add nuw nsw i32 %.048, 32768
%105 = icmp slt i32 %104, %32
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60
; <label>:106: ; preds = %63
%107 = mul nsw i32 %67, %3
%108 = add nsw i32 %107, %41
%109 = sext i32 %108 to i64
%110 = getelementptr inbounds float, float* %45, i64 %109
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8
br label %112
; <label>:112: ; preds = %106, %63
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ]
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8
%115 = add nsw i32 %.04347.us.us, 2
%exitcond.1 = icmp eq i32 %115, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54
}
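
; EigenMetaKernel instantiated for assigning a TensorForcedEvalOp result: the
; reduction has already been materialized by the forced evaluation, so this is
; a plain grid-stride copy kernel. Note it moves each 4-byte element as an i32
; (bitcast load/store) rather than as a float.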
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.6"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.021.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %0, i64 0, i32 0, i32 0
%.sroa.021.0.copyload = load float*, float** %.sroa.021.0..sroa_idx, align 8
%.sroa.5.0..sroa_idx25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %0, i64 0, i32 1, i32 3
%.sroa.5.0.copyload = load float*, float** %.sroa.5.0..sroa_idx25, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i.preheader, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
.lr.ph.i.preheader: ; preds = %2
br label %.lr.ph.i
.lr.ph.i: ; preds = %.lr.ph.i.preheader, %.lr.ph.i
%.07.i = phi i32 [ %17, %.lr.ph.i ], [ %7, %.lr.ph.i.preheader ]
%11 = sext i32 %.07.i to i64
%12 = getelementptr inbounds float, float* %.sroa.5.0.copyload, i64 %11
%13 = bitcast float* %12 to i32*
%14 = load i32, i32* %13, align 4
%15 = getelementptr inbounds float, float* %.sroa.021.0.copyload, i64 %11
%16 = bitcast float* %15 to i32*
store i32 %14, i32* %16, align 4
%17 = add nsw i32 %.07.i, %9
%18 = icmp slt i32 %17, %1
br i1 %18, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit: ; preds = %.lr.ph.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit, %2
ret void
}
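
; EigenMetaKernel for the assign-of-reduction expression evaluated in-line:
; each covered output index computes its own sum over the reduced dimension.
; The evaluator fields pulled out below look like the input pointer
; (%.sroa.8), the reduced extent (%.sroa.648), and two 32-bit strides packed
; into one i64 (%.sroa.545: low word scales the output index, high word steps
; along the reduced dimension) -- an interpretation read off the address math.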
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.11"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.041.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 0, i32 0
%.sroa.041.0.copyload = load float*, float** %.sroa.041.0..sroa_idx, align 8
%.sroa.545.0..sroa_idx46 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 7
%.sroa.545.0..sroa_cast = bitcast i32* %.sroa.545.0..sroa_idx46 to i64*
%.sroa.545.0.copyload = load i64, i64* %.sroa.545.0..sroa_cast, align 8
%.sroa.648.0..sroa_idx49 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 9, i32 0, i64 0
%.sroa.648.0.copyload = load i32, i32* %.sroa.648.0..sroa_idx49, align 8
%.sroa.8.0..sroa_idx53 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 10, i32 0
%.sroa.8.0.copyload = load float*, float** %.sroa.8.0..sroa_idx53, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
.lr.ph.i: ; preds = %2
%11 = trunc i64 %.sroa.545.0.copyload to i32
%12 = icmp sgt i32 %.sroa.648.0.copyload, 0
%13 = lshr i64 %.sroa.545.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i
br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i
%15 = add i32 %.sroa.648.0.copyload, -1
%xtraiter = and i32 %.sroa.648.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.648.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i
br label %18
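; Main reduction loop, unrolled by 4: four ld.global.nc loads at consecutive
; multiples of the reduction stride %14, chained into a running fadd. The `or`
; trick for indices 1..3 is valid because the unrolled counter is always a
; multiple of 4; %niter counts the unrolled trip count down in steps of 4.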
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa67 = phi i32 [ %47, %18 ]
%.lcssa66 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
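; Epilogue loop: runs the %xtraiter (= extent & 3) leftover iterations one at
; a time, continuing the partial sum carried in %.unr.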
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; <label>:48: ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !61
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa68 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa68, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit
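; Zero-trip reduction path (reduced extent <= 0): every covered output element
; is simply set to 0.0, the sum identity.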
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit, %2
ret void
}
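
; InitVector kernel: writes the scalar argument %0 into every element of a 1-D
; output of size %1 using a grid-stride loop (stride = gridDim.x * blockDim.x).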
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = add nuw nsw i32 %6, %7
%9 = icmp slt i32 %8, %1
br i1 %9, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %3
%10 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %2, i64 0, i32 0
%11 = load float*, float** %10, align 8
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %5
br label %14
._crit_edge.loopexit: ; preds = %14
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %3
ret void
; <label>:14: ; preds = %.lr.ph, %14
%.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ]
%15 = sext i32 %.08 to i64
%16 = getelementptr inbounds float, float* %11, i64 %15
store float %0, float* %16, align 4
%17 = add i32 %13, %.08
%18 = icmp slt i32 %17, %1
br i1 %18, label %14, label %._crit_edge.loopexit
}
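
; RowReduceKernel<32, 256, 128> with CudaSumReducer: after the same
; launch-shape assertion preamble (256 threads, 32 blocks, all other dims 1),
; each work item covers a 32768-element chunk of one row; every thread
; accumulates up to 128 elements (8 outer steps x 16 loads, stride 256), then
; the warp is reduced with shfl.down and lane 0 atomically adds into the
; output row.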
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0
%38 = load float*, float** %37, align 8
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
br label %41
._crit_edge.loopexit: ; preds = %187
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:41: ; preds = %.lr.ph, %187
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ]
%42 = srem i32 %.0114, %31
%43 = sdiv i32 %.0114, %31
%44 = shl nsw i32 %42, 15
%45 = or i32 %44, %34
%46 = icmp slt i32 %43, %2
br i1 %46, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %164, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %41
%47 = mul nsw i32 %43, %3
%48 = add i32 %47, %45
br label %49
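; Main accumulation loop (block 49): 8 iterations of 16 unconditional loads.
; Only the last offset of the group (base | 3840) is bounds-checked up front;
; if it is in range the whole 16-load block below executes, otherwise control
; falls through to the per-element tail at .preheader101.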
; <label>:49: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ]
%50 = add nuw nsw i32 %.098108, 16
%51 = shl i32 %.098108, 8
%52 = or i32 %51, 3840
%53 = add nsw i32 %52, %45
%54 = icmp slt i32 %53, %3
br i1 %54, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %49
%55 = add i32 %48, %51
%56 = sext i32 %55 to i64
%57 = getelementptr inbounds float, float* %40, i64 %56
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8
%59 = fadd float %.095109, %58
%60 = shl i32 %.098108, 8
%61 = or i32 %60, 256
%62 = add i32 %48, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, float* %40, i64 %63
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8
%66 = fadd float %59, %65
%67 = shl i32 %.098108, 8
%68 = or i32 %67, 512
%69 = add i32 %48, %68
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %40, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %66, %72
%74 = shl i32 %.098108, 8
%75 = or i32 %74, 768
%76 = add i32 %48, %75
%77 = sext i32 %76 to i64
%78 = getelementptr inbounds float, float* %40, i64 %77
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8
%80 = fadd float %73, %79
%81 = shl i32 %.098108, 8
%82 = or i32 %81, 1024
%83 = add i32 %48, %82
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %40, i64 %84
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8
%87 = fadd float %80, %86
%88 = shl i32 %.098108, 8
%89 = or i32 %88, 1280
%90 = add i32 %48, %89
%91 = sext i32 %90 to i64
%92 = getelementptr inbounds float, float* %40, i64 %91
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8
%94 = fadd float %87, %93
%95 = shl i32 %.098108, 8
%96 = or i32 %95, 1536
%97 = add i32 %48, %96
%98 = sext i32 %97 to i64
%99 = getelementptr inbounds float, float* %40, i64 %98
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8
%101 = fadd float %94, %100
%102 = shl i32 %.098108, 8
%103 = or i32 %102, 1792
%104 = add i32 %48, %103
%105 = sext i32 %104 to i64
%106 = getelementptr inbounds float, float* %40, i64 %105
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8
%108 = fadd float %101, %107
%109 = shl i32 %.098108, 8
%110 = or i32 %109, 2048
%111 = add i32 %48, %110
%112 = sext i32 %111 to i64
%113 = getelementptr inbounds float, float* %40, i64 %112
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8
%115 = fadd float %108, %114
%116 = shl i32 %.098108, 8
%117 = or i32 %116, 2304
%118 = add i32 %48, %117
%119 = sext i32 %118 to i64
%120 = getelementptr inbounds float, float* %40, i64 %119
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8
%122 = fadd float %115, %121
%123 = shl i32 %.098108, 8
%124 = or i32 %123, 2560
%125 = add i32 %48, %124
%126 = sext i32 %125 to i64
%127 = getelementptr inbounds float, float* %40, i64 %126
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8
%129 = fadd float %122, %128
%130 = shl i32 %.098108, 8
%131 = or i32 %130, 2816
%132 = add i32 %48, %131
%133 = sext i32 %132 to i64
%134 = getelementptr inbounds float, float* %40, i64 %133
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8
%136 = fadd float %129, %135
%137 = shl i32 %.098108, 8
%138 = or i32 %137, 3072
%139 = add i32 %48, %138
%140 = sext i32 %139 to i64
%141 = getelementptr inbounds float, float* %40, i64 %140
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8
%143 = fadd float %136, %142
%144 = shl i32 %.098108, 8
%145 = or i32 %144, 3328
%146 = add i32 %48, %145
%147 = sext i32 %146 to i64
%148 = getelementptr inbounds float, float* %40, i64 %147
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8
%150 = fadd float %143, %149
%151 = shl i32 %.098108, 8
%152 = or i32 %151, 3584
%153 = add i32 %48, %152
%154 = sext i32 %153 to i64
%155 = getelementptr inbounds float, float* %40, i64 %154
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8
%157 = fadd float %150, %156
%158 = shl i32 %.098108, 8
%159 = or i32 %158, 3840
%160 = add i32 %48, %159
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %40, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %157, %163
%165 = icmp slt i32 %50, 128
br i1 %165, label %49, label %.thread.preheader.loopexit
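; Bounds-checked tail: the 16 offsets (0, 256, ..., 3840) are now tested
; against the row length one by one; each of the blocks 168/190/200/... adds
; a single guarded load, bailing out to the warp-reduction phase at the first
; out-of-range offset.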
.preheader101: ; preds = %49
%.lcssa = phi i32 [ %51, %49 ]
%.098108.lcssa = phi i32 [ %.098108, %49 ]
%.095109.lcssa = phi float [ %.095109, %49 ]
%166 = add nsw i32 %.lcssa, %45
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %47
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %40, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = fadd float %.095109.lcssa, %172
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %45
%177 = icmp slt i32 %176, %3
br i1 %177, label %190, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %46, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %187, label %183
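; Warp-level tree reduction: five shfl.down.b32 steps with deltas 16, 8, 4,
; 2, 1 (the trailing 31 in the inline PTX is the clamp operand), adding the
; shuffled lane's value into the running sum each step.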
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = fadd float %.8112, %179
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !62
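; Final merge (block 183): only lane 0 of the warp (%.not tested above) with a
; valid row reaches this block and adds the warp's partial sum into the output
; via llvm.nvvm.atomic.load.add.f32; block 187 then strides to the next work
; item by 32, the grid size.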
; <label>:183: ; preds = %178
%184 = sext i32 %43 to i64
%185 = getelementptr inbounds float, float* %38, i64 %184
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8
br label %187
; <label>:187: ; preds = %178, %183
%188 = add nuw nsw i32 %.0114, 32
%189 = icmp slt i32 %188, %32
br i1 %189, label %41, label %._crit_edge.loopexit
; <label>:190: ; preds = %168
%191 = add nsw i32 %176, %47
%192 = sext i32 %191 to i64
%193 = getelementptr inbounds float, float* %40, i64 %192
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8
%195 = fadd float %173, %194
%196 = shl i32 %.098108.lcssa, 8
%197 = or i32 %196, 512
%198 = add nsw i32 %197, %45
%199 = icmp slt i32 %198, %3
br i1 %199, label %200, label %.thread.preheader
; <label>:200: ; preds = %190
%201 = add nsw i32 %198, %47
%202 = sext i32 %201 to i64
%203 = getelementptr inbounds float, float* %40, i64 %202
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8
%205 = fadd float %195, %204
%206 = shl i32 %.098108.lcssa, 8
%207 = or i32 %206, 768
%208 = add nsw i32 %207, %45
%209 = icmp slt i32 %208, %3
br i1 %209, label %210, label %.thread.preheader
; <label>:210: ; preds = %200
%211 = add nsw i32 %208, %47
%212 = sext i32 %211 to i64
%213 = getelementptr inbounds float, float* %40, i64 %212
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8
%215 = fadd float %205, %214
%216 = shl i32 %.098108.lcssa, 8
%217 = or i32 %216, 1024
%218 = add nsw i32 %217, %45
%219 = icmp slt i32 %218, %3
br i1 %219, label %220, label %.thread.preheader
; <label>:220: ; preds = %210
%221 = add nsw i32 %218, %47
%222 = sext i32 %221 to i64
%223 = getelementptr inbounds float, float* %40, i64 %222
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8
%225 = fadd float %215, %224
%226 = shl i32 %.098108.lcssa, 8
%227 = or i32 %226, 1280
%228 = add nsw i32 %227, %45
%229 = icmp slt i32 %228, %3
br i1 %229, label %230, label %.thread.preheader
; <label>:230: ; preds = %220
%231 = add nsw i32 %228, %47
%232 = sext i32 %231 to i64
%233 = getelementptr inbounds float, float* %40, i64 %232
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8
%235 = fadd float %225, %234
%236 = shl i32 %.098108.lcssa, 8
%237 = or i32 %236, 1536
%238 = add nsw i32 %237, %45
%239 = icmp slt i32 %238, %3
br i1 %239, label %240, label %.thread.preheader
; <label>:240: ; preds = %230
%241 = add nsw i32 %238, %47
%242 = sext i32 %241 to i64
%243 = getelementptr inbounds float, float* %40, i64 %242
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8
%245 = fadd float %235, %244
%246 = shl i32 %.098108.lcssa, 8
%247 = or i32 %246, 1792
%248 = add nsw i32 %247, %45
%249 = icmp slt i32 %248, %3
br i1 %249, label %250, label %.thread.preheader
; <label>:250: ; preds = %240
%251 = add nsw i32 %248, %47
%252 = sext i32 %251 to i64
%253 = getelementptr inbounds float, float* %40, i64 %252
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8
%255 = fadd float %245, %254
%256 = shl i32 %.098108.lcssa, 8
%257 = or i32 %256, 2048
%258 = add nsw i32 %257, %45
%259 = icmp slt i32 %258, %3
br i1 %259, label %260, label %.thread.preheader
; <label>:260: ; preds = %250
%261 = add nsw i32 %258, %47
%262 = sext i32 %261 to i64
%263 = getelementptr inbounds float, float* %40, i64 %262
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8
%265 = fadd float %255, %264
%266 = shl i32 %.098108.lcssa, 8
%267 = or i32 %266, 2304
%268 = add nsw i32 %267, %45
%269 = icmp slt i32 %268, %3
br i1 %269, label %270, label %.thread.preheader
; <label>:270: ; preds = %260
%271 = add nsw i32 %268, %47
%272 = sext i32 %271 to i64
%273 = getelementptr inbounds float, float* %40, i64 %272
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8
%275 = fadd float %265, %274
%276 = shl i32 %.098108.lcssa, 8
%277 = or i32 %276, 2560
%278 = add nsw i32 %277, %45
%279 = icmp slt i32 %278, %3
br i1 %279, label %280, label %.thread.preheader
; <label>:280: ; preds = %270
%281 = add nsw i32 %278, %47
%282 = sext i32 %281 to i64
%283 = getelementptr inbounds float, float* %40, i64 %282
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8
%285 = fadd float %275, %284
%286 = shl i32 %.098108.lcssa, 8
%287 = or i32 %286, 2816
%288 = add nsw i32 %287, %45
%289 = icmp slt i32 %288, %3
br i1 %289, label %290, label %.thread.preheader
; <label>:290: ; preds = %280
%291 = add nsw i32 %288, %47
%292 = sext i32 %291 to i64
%293 = getelementptr inbounds float, float* %40, i64 %292
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8
%295 = fadd float %285, %294
%296 = shl i32 %.098108.lcssa, 8
%297 = or i32 %296, 3072
%298 = add nsw i32 %297, %45
%299 = icmp slt i32 %298, %3
br i1 %299, label %300, label %.thread.preheader
; <label>:300: ; preds = %290
%301 = add nsw i32 %298, %47
%302 = sext i32 %301 to i64
%303 = getelementptr inbounds float, float* %40, i64 %302
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8
%305 = fadd float %295, %304
%306 = shl i32 %.098108.lcssa, 8
%307 = or i32 %306, 3328
%308 = add nsw i32 %307, %45
%309 = icmp slt i32 %308, %3
br i1 %309, label %310, label %.thread.preheader
; <label>:310: ; preds = %300
%311 = add nsw i32 %308, %47
%312 = sext i32 %311 to i64
%313 = getelementptr inbounds float, float* %40, i64 %312
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8
%315 = fadd float %305, %314
%316 = shl i32 %.098108.lcssa, 8
%317 = or i32 %316, 3584
%318 = add nsw i32 %317, %45
%319 = icmp slt i32 %318, %3
br i1 %319, label %320, label %.thread.preheader
; <label>:320: ; preds = %310
%321 = add nsw i32 %318, %47
%322 = sext i32 %321 to i64
%323 = getelementptr inbounds float, float* %40, i64 %322
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8
%325 = fadd float %315, %324
br label %.thread.preheader
}
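
; RowReduceKernel<32, 256, 128> with CudaMaxReducer: identical control
; skeleton to the sum variant above, but the accumulator starts from the
; reducer's stored initial value (%.idx.val), fadd becomes llvm.nvvm.fmax.f,
; and the final merge is a cmpxchg loop implementing atomic max instead of a
; native atomic add.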
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
br label %39
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ]
%40 = srem i32 %.0114, %31
%41 = sdiv i32 %.0114, %31
%42 = shl nsw i32 %40, 15
%43 = or i32 %42, %34
%.idx.val = load float, float* %.idx, align 4
%44 = icmp slt i32 %41, %2
br i1 %44, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %163, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %39
%45 = mul nsw i32 %41, %3
%46 = add i32 %45, %43
%47 = load float*, float** %38, align 8
br label %48
; <label>:48: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ]
%49 = add nuw nsw i32 %.098108, 16
%50 = shl i32 %.098108, 8
%51 = or i32 %50, 3840
%52 = add nsw i32 %51, %43
%53 = icmp slt i32 %52, %3
br i1 %53, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %48
%54 = add i32 %46, %50
%55 = sext i32 %54 to i64
%56 = getelementptr inbounds float, float* %47, i64 %55
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8
%59 = shl i32 %.098108, 8
%60 = or i32 %59, 256
%61 = add i32 %46, %60
%62 = sext i32 %61 to i64
%63 = getelementptr inbounds float, float* %47, i64 %62
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8
%66 = shl i32 %.098108, 8
%67 = or i32 %66, 512
%68 = add i32 %46, %67
%69 = sext i32 %68 to i64
%70 = getelementptr inbounds float, float* %47, i64 %69
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8
%73 = shl i32 %.098108, 8
%74 = or i32 %73, 768
%75 = add i32 %46, %74
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %47, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8
%80 = shl i32 %.098108, 8
%81 = or i32 %80, 1024
%82 = add i32 %46, %81
%83 = sext i32 %82 to i64
%84 = getelementptr inbounds float, float* %47, i64 %83
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8
%87 = shl i32 %.098108, 8
%88 = or i32 %87, 1280
%89 = add i32 %46, %88
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds float, float* %47, i64 %90
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8
%94 = shl i32 %.098108, 8
%95 = or i32 %94, 1536
%96 = add i32 %46, %95
%97 = sext i32 %96 to i64
%98 = getelementptr inbounds float, float* %47, i64 %97
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8
%101 = shl i32 %.098108, 8
%102 = or i32 %101, 1792
%103 = add i32 %46, %102
%104 = sext i32 %103 to i64
%105 = getelementptr inbounds float, float* %47, i64 %104
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8
%108 = shl i32 %.098108, 8
%109 = or i32 %108, 2048
%110 = add i32 %46, %109
%111 = sext i32 %110 to i64
%112 = getelementptr inbounds float, float* %47, i64 %111
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8
%115 = shl i32 %.098108, 8
%116 = or i32 %115, 2304
%117 = add i32 %46, %116
%118 = sext i32 %117 to i64
%119 = getelementptr inbounds float, float* %47, i64 %118
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8
%122 = shl i32 %.098108, 8
%123 = or i32 %122, 2560
%124 = add i32 %46, %123
%125 = sext i32 %124 to i64
%126 = getelementptr inbounds float, float* %47, i64 %125
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8
%129 = shl i32 %.098108, 8
%130 = or i32 %129, 2816
%131 = add i32 %46, %130
%132 = sext i32 %131 to i64
%133 = getelementptr inbounds float, float* %47, i64 %132
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8
%136 = shl i32 %.098108, 8
%137 = or i32 %136, 3072
%138 = add i32 %46, %137
%139 = sext i32 %138 to i64
%140 = getelementptr inbounds float, float* %47, i64 %139
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8
%143 = shl i32 %.098108, 8
%144 = or i32 %143, 3328
%145 = add i32 %46, %144
%146 = sext i32 %145 to i64
%147 = getelementptr inbounds float, float* %47, i64 %146
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8
%150 = shl i32 %.098108, 8
%151 = or i32 %150, 3584
%152 = add i32 %46, %151
%153 = sext i32 %152 to i64
%154 = getelementptr inbounds float, float* %47, i64 %153
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8
%157 = shl i32 %.098108, 8
%158 = or i32 %157, 3840
%159 = add i32 %46, %158
%160 = sext i32 %159 to i64
%161 = getelementptr inbounds float, float* %47, i64 %160
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8
%164 = icmp slt i32 %49, 128
br i1 %164, label %48, label %.thread.preheader.loopexit
.preheader101: ; preds = %48
%.lcssa = phi i32 [ %50, %48 ]
%.098108.lcssa = phi i32 [ %.098108, %48 ]
%.095109.lcssa = phi float [ %.095109, %48 ]
%165 = load float*, float** %38, align 8
%166 = add nsw i32 %.lcssa, %43
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %45
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %165, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %43
%177 = icmp slt i32 %176, %3
br i1 %177, label %198, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %44, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !63
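; NOTE: the .thread loop above is a warp-level tree reduction: shfl.down.b32
; pulls a partial max from %.092113 lanes away (16, 8, 4, 2, 1) and
; @llvm.nvvm.fmax.f folds it in, so after five iterations lane 0 holds the
; maximum of the whole warp.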
; <label>:183: ; preds = %178
%184 = sext i32 %41 to i64
%185 = load float*, float** %37, align 8
%186 = getelementptr inbounds float, float* %185, i64 %184
%187 = bitcast float %.lcssa138 to i32
%188 = bitcast float* %186 to i32*
%189 = load i32, i32* %188, align 4
br label %190
; <label>:190: ; preds = %193, %183
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ]
%191 = bitcast i32 %.011.i to float
%192 = fcmp olt float %191, %.lcssa138
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit
; <label>:193: ; preds = %190
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst
%195 = extractvalue { i32, i1 } %194, 0
%not..i = icmp eq i32 %.011.i, %195
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190
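; NOTE: blocks %183..%193 are the inlined body of
; CudaMaxReducer::atomic_reduce(float*, float): a float atomic-max emulated
; as a CAS loop — the candidate is bitcast to i32 and the seq_cst cmpxchg is
; retried until either the stored value is no longer smaller than the
; candidate or the exchange succeeds.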
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178
%196 = add nuw nsw i32 %.0114, 32
%197 = icmp slt i32 %196, %32
br i1 %197, label %39, label %._crit_edge.loopexit
; <label>:198: ; preds = %168
%199 = add nsw i32 %176, %45
%200 = sext i32 %199 to i64
%201 = getelementptr inbounds float, float* %165, i64 %200
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8
%204 = shl i32 %.098108.lcssa, 8
%205 = or i32 %204, 512
%206 = add nsw i32 %205, %43
%207 = icmp slt i32 %206, %3
br i1 %207, label %208, label %.thread.preheader
; <label>:208: ; preds = %198
%209 = add nsw i32 %206, %45
%210 = sext i32 %209 to i64
%211 = getelementptr inbounds float, float* %165, i64 %210
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8
%214 = shl i32 %.098108.lcssa, 8
%215 = or i32 %214, 768
%216 = add nsw i32 %215, %43
%217 = icmp slt i32 %216, %3
br i1 %217, label %218, label %.thread.preheader
; <label>:218: ; preds = %208
%219 = add nsw i32 %216, %45
%220 = sext i32 %219 to i64
%221 = getelementptr inbounds float, float* %165, i64 %220
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8
%224 = shl i32 %.098108.lcssa, 8
%225 = or i32 %224, 1024
%226 = add nsw i32 %225, %43
%227 = icmp slt i32 %226, %3
br i1 %227, label %228, label %.thread.preheader
; <label>:228: ; preds = %218
%229 = add nsw i32 %226, %45
%230 = sext i32 %229 to i64
%231 = getelementptr inbounds float, float* %165, i64 %230
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8
%234 = shl i32 %.098108.lcssa, 8
%235 = or i32 %234, 1280
%236 = add nsw i32 %235, %43
%237 = icmp slt i32 %236, %3
br i1 %237, label %238, label %.thread.preheader
; <label>:238: ; preds = %228
%239 = add nsw i32 %236, %45
%240 = sext i32 %239 to i64
%241 = getelementptr inbounds float, float* %165, i64 %240
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8
%244 = shl i32 %.098108.lcssa, 8
%245 = or i32 %244, 1536
%246 = add nsw i32 %245, %43
%247 = icmp slt i32 %246, %3
br i1 %247, label %248, label %.thread.preheader
; <label>:248: ; preds = %238
%249 = add nsw i32 %246, %45
%250 = sext i32 %249 to i64
%251 = getelementptr inbounds float, float* %165, i64 %250
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8
%254 = shl i32 %.098108.lcssa, 8
%255 = or i32 %254, 1792
%256 = add nsw i32 %255, %43
%257 = icmp slt i32 %256, %3
br i1 %257, label %258, label %.thread.preheader
; <label>:258: ; preds = %248
%259 = add nsw i32 %256, %45
%260 = sext i32 %259 to i64
%261 = getelementptr inbounds float, float* %165, i64 %260
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8
%264 = shl i32 %.098108.lcssa, 8
%265 = or i32 %264, 2048
%266 = add nsw i32 %265, %43
%267 = icmp slt i32 %266, %3
br i1 %267, label %268, label %.thread.preheader
; <label>:268: ; preds = %258
%269 = add nsw i32 %266, %45
%270 = sext i32 %269 to i64
%271 = getelementptr inbounds float, float* %165, i64 %270
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8
%274 = shl i32 %.098108.lcssa, 8
%275 = or i32 %274, 2304
%276 = add nsw i32 %275, %43
%277 = icmp slt i32 %276, %3
br i1 %277, label %278, label %.thread.preheader
; <label>:278: ; preds = %268
%279 = add nsw i32 %276, %45
%280 = sext i32 %279 to i64
%281 = getelementptr inbounds float, float* %165, i64 %280
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8
%284 = shl i32 %.098108.lcssa, 8
%285 = or i32 %284, 2560
%286 = add nsw i32 %285, %43
%287 = icmp slt i32 %286, %3
br i1 %287, label %288, label %.thread.preheader
; <label>:288: ; preds = %278
%289 = add nsw i32 %286, %45
%290 = sext i32 %289 to i64
%291 = getelementptr inbounds float, float* %165, i64 %290
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8
%294 = shl i32 %.098108.lcssa, 8
%295 = or i32 %294, 2816
%296 = add nsw i32 %295, %43
%297 = icmp slt i32 %296, %3
br i1 %297, label %298, label %.thread.preheader
; <label>:298: ; preds = %288
%299 = add nsw i32 %296, %45
%300 = sext i32 %299 to i64
%301 = getelementptr inbounds float, float* %165, i64 %300
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8
%304 = shl i32 %.098108.lcssa, 8
%305 = or i32 %304, 3072
%306 = add nsw i32 %305, %43
%307 = icmp slt i32 %306, %3
br i1 %307, label %308, label %.thread.preheader
; <label>:308: ; preds = %298
%309 = add nsw i32 %306, %45
%310 = sext i32 %309 to i64
%311 = getelementptr inbounds float, float* %165, i64 %310
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8
%314 = shl i32 %.098108.lcssa, 8
%315 = or i32 %314, 3328
%316 = add nsw i32 %315, %43
%317 = icmp slt i32 %316, %3
br i1 %317, label %318, label %.thread.preheader
; <label>:318: ; preds = %308
%319 = add nsw i32 %316, %45
%320 = sext i32 %319 to i64
%321 = getelementptr inbounds float, float* %165, i64 %320
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8
%324 = shl i32 %.098108.lcssa, 8
%325 = or i32 %324, 3584
%326 = add nsw i32 %325, %43
%327 = icmp slt i32 %326, %3
br i1 %327, label %328, label %.thread.preheader
; <label>:328: ; preds = %318
%329 = add nsw i32 %326, %45
%330 = sext i32 %329 to i64
%331 = getelementptr inbounds float, float* %165, i64 %330
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8
br label %.thread.preheader
}
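; The kernel below is ColumnReduceKernel<128, 256, 16, ..., CudaSumReducer>.
; After asserting the exact launch shape it was specialized for (256x1x1
; threads, 128x1x1 blocks), each thread grid-strides over (column, 16-row
; tile) work items in steps of 32768, sums up to 16 __ldg loads per tile, and
; folds the tile sum into output[column] via @llvm.nvvm.atomic.load.add.f32.
; The .lr.ph.split path (reachable only when operand %3 is negative)
; degenerates into atomic adds of the reducer identity 0.0.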
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
%41 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0
%42 = load float*, float** %41, align 8
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
%43 = add i32 %32, -1
%44 = sub i32 %43, %34
%45 = sub i32 %44, %35
%46 = lshr i32 %45, 15
%47 = add nuw nsw i32 %46, 1
%xtraiter = and i32 %47, 3
%48 = icmp ult i32 %45, 98304
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader
%unroll_iter = sub nsw i32 %47, %xtraiter
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ]
%49 = srem i32 %.047.us, %3
%50 = sdiv i32 %.047.us, %3
%51 = srem i32 %50, %31
%52 = shl nsw i32 %51, 4
br label %53
; <label>:53: ; preds = %104, %.lr.ph.split.us
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ]
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ]
%54 = add nuw nsw i32 %.04346.us.us, %52
%55 = icmp slt i32 %54, %2
br i1 %55, label %56, label %62
; <label>:56: ; preds = %53
%57 = mul nsw i32 %54, %3
%58 = add nsw i32 %57, %49
%59 = sext i32 %58 to i64
%60 = getelementptr inbounds float, float* %40, i64 %59
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8
br label %62
; <label>:62: ; preds = %56, %53
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ]
%64 = fadd float %.04445.us.us, %63
%65 = or i32 %.04346.us.us, 1
%66 = add nuw nsw i32 %65, %52
%67 = icmp slt i32 %66, %2
br i1 %67, label %98, label %104
.us-lcssa.us.us: ; preds = %104
%.lcssa = phi float [ %106, %104 ]
%68 = sext i32 %49 to i64
%69 = getelementptr inbounds float, float* %42, i64 %68
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8
%71 = add nuw nsw i32 %.047.us, 32768
%72 = icmp slt i32 %71, %32
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us
br label %._crit_edge
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ]
br label %._crit_edge.loopexit59.unr-lcssa
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa
br label %.lr.ph.split.epil
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ]
%73 = srem i32 %.047.epil, %3
%74 = sext i32 %73 to i64
%75 = getelementptr inbounds float, float* %42, i64 %74
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8
%77 = add nuw nsw i32 %.047.epil, 32768
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !64
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil
br label %._crit_edge.loopexit59
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29
ret void
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ]
%78 = srem i32 %.047, %3
%79 = sext i32 %78 to i64
%80 = getelementptr inbounds float, float* %42, i64 %79
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8
%82 = add nuw nsw i32 %.047, 32768
%83 = srem i32 %82, %3
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %42, i64 %84
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8
%87 = add nsw i32 %.047, 65536
%88 = srem i32 %87, %3
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %42, i64 %89
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8
%92 = add nsw i32 %.047, 98304
%93 = srem i32 %92, %3
%94 = sext i32 %93 to i64
%95 = getelementptr inbounds float, float* %42, i64 %94
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8
%97 = add nsw i32 %.047, 131072
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split
; <label>:98: ; preds = %62
%99 = mul nsw i32 %66, %3
%100 = add nsw i32 %99, %49
%101 = sext i32 %100 to i64
%102 = getelementptr inbounds float, float* %40, i64 %101
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8
br label %104
; <label>:104: ; preds = %98, %62
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ]
%106 = fadd float %64, %105
%107 = add nsw i32 %.04346.us.us, 2
%exitcond.1 = icmp eq i32 %107, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53
}
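; The kernel below is the CudaMaxReducer specialization of the same
; ColumnReduceKernel<128, 256, 16, ...>: identical traversal, but each tile
; is folded with @llvm.nvvm.fmax.f starting from the reducer's stored initial
; value (%.idx45), and the per-column result is committed through the
; CAS-based atomic_reduce loop rather than an atomic add.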
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0
%40 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ]
%41 = srem i32 %.048.us, %3
%42 = sdiv i32 %.048.us, %3
%43 = srem i32 %42, %31
%44 = shl nsw i32 %43, 4
%.idx45.val.us = load float, float* %.idx45, align 4
%45 = load float*, float** %39, align 8
br label %54
; <label>:46: ; preds = %49, %.us-lcssa.us.us
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ]
%47 = bitcast i32 %.011.i.us to float
%48 = fcmp olt float %47, %.lcssa
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
; <label>:49: ; preds = %46
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst
%51 = extractvalue { i32, i1 } %50, 0
%not..i.us = icmp eq i32 %.011.i.us, %51
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46
%52 = add nuw nsw i32 %.048.us, 32768
%53 = icmp slt i32 %52, %32
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit
; <label>:54: ; preds = %112, %.lr.ph.split.us
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ]
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ]
%55 = add nuw nsw i32 %.04347.us.us, %44
%56 = icmp slt i32 %55, %2
br i1 %56, label %57, label %63
; <label>:57: ; preds = %54
%58 = mul nsw i32 %55, %3
%59 = add nsw i32 %58, %41
%60 = sext i32 %59 to i64
%61 = getelementptr inbounds float, float* %45, i64 %60
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8
br label %63
; <label>:63: ; preds = %54, %57
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ]
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8
%66 = or i32 %.04347.us.us, 1
%67 = add nuw nsw i32 %66, %44
%68 = icmp slt i32 %67, %2
br i1 %68, label %106, label %112
.us-lcssa.us.us: ; preds = %112
%.lcssa = phi float [ %114, %112 ]
%69 = sext i32 %41 to i64
%70 = load float*, float** %40, align 8
%71 = getelementptr inbounds float, float* %70, i64 %69
%72 = bitcast float %.lcssa to i32
%73 = bitcast float* %71 to i32*
%74 = load i32, i32* %73, align 4
br label %46
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
br label %._crit_edge
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29
ret void
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ]
%.idx45.val = load float, float* %.idx45, align 4
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8
%91 = srem i32 %.048, %3
%92 = sext i32 %91 to i64
%93 = load float*, float** %40, align 8
%94 = getelementptr inbounds float, float* %93, i64 %92
%95 = bitcast float %90 to i32
%96 = bitcast float* %94 to i32*
%97 = load i32, i32* %96, align 4
br label %98
; <label>:98: ; preds = %101, %.lr.ph.split
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ]
%99 = bitcast i32 %.011.i to float
%100 = fcmp olt float %99, %90
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
; <label>:101: ; preds = %98
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst
%103 = extractvalue { i32, i1 } %102, 0
%not..i = icmp eq i32 %.011.i, %103
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101
%104 = add nuw nsw i32 %.048, 32768
%105 = icmp slt i32 %104, %32
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60
; <label>:106: ; preds = %63
%107 = mul nsw i32 %67, %3
%108 = add nsw i32 %107, %41
%109 = sext i32 %108 to i64
%110 = getelementptr inbounds float, float* %45, i64 %109
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8
br label %112
; <label>:112: ; preds = %106, %63
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ]
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8
%115 = add nsw i32 %.04347.us.us, 2
%exitcond.1 = icmp eq i32 %115, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54
}
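; The kernel below is FullReductionKernel<256, 128, ..., SumReducer<float>>,
; reducing the whole tensor to a single scalar (presumably 256 threads per
; block, 128 coefficients per thread). When launched with one block, global
; thread 0 zeroes the output and all threads sync before accumulating. Each
; thread then sums a private slice of at most 32768 coefficients in an
; 8x-unrolled __ldg loop of stride 256, a shfl.down tree reduces each warp,
; and lane 0 atomic-adds its warp's sum into the output.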
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, float*) #2 comdat {
%5 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%6 = shl nuw nsw i32 %5, 15
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = or i32 %6, %7
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%10 = icmp eq i32 %9, 1
br i1 %10, label %11, label %15
; <label>:11: ; preds = %4
%12 = icmp eq i32 %8, 0
br i1 %12, label %13, label %14
; <label>:13: ; preds = %11
store float 0.000000e+00, float* %3, align 4
br label %14
; <label>:14: ; preds = %13, %11
tail call void @llvm.cuda.syncthreads()
br label %15
; <label>:15: ; preds = %14, %4
%16 = sub nsw i32 %2, %8
%17 = icmp sgt i32 %16, 32768
%..i = select i1 %17, i32 32768, i32 %16
%18 = icmp sgt i32 %16, 0
br i1 %18, label %.lr.ph, label %.preheader.preheader
.preheader.preheader.loopexit: ; preds = %.epil.preheader
%.lcssa47 = phi float [ %23, %.epil.preheader ]
br label %.preheader.preheader
.preheader.preheader: ; preds = %.preheader.preheader.loopexit, %.preheader.preheader.loopexit.unr-lcssa, %15
%.132.ph = phi float [ 0.000000e+00, %15 ], [ %.lcssa36.ph, %.preheader.preheader.loopexit.unr-lcssa ], [ %.lcssa47, %.preheader.preheader.loopexit ]
br label %.preheader
.preheader.preheader.loopexit.unr-lcssa.loopexit: ; preds = %32
%.lcssa49 = phi i32 [ %80, %32 ]
%.lcssa48 = phi float [ %79, %32 ]
br label %.preheader.preheader.loopexit.unr-lcssa
.preheader.preheader.loopexit.unr-lcssa: ; preds = %.preheader.preheader.loopexit.unr-lcssa.loopexit, %.lr.ph
%.lcssa36.ph = phi float [ undef, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%.02535.unr = phi i32 [ 0, %.lr.ph ], [ %.lcssa49, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%.03134.unr = phi float [ 0.000000e+00, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %.preheader.preheader, label %.epil.preheader.preheader
.epil.preheader.preheader: ; preds = %.preheader.preheader.loopexit.unr-lcssa
br label %.epil.preheader
.epil.preheader: ; preds = %.epil.preheader.preheader, %.epil.preheader
%.02535.epil = phi i32 [ %24, %.epil.preheader ], [ %.02535.unr, %.epil.preheader.preheader ]
%.03134.epil = phi float [ %23, %.epil.preheader ], [ %.03134.unr, %.epil.preheader.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.epil.preheader ], [ %xtraiter, %.epil.preheader.preheader ]
%19 = add nuw nsw i32 %.02535.epil, %8
%20 = sext i32 %19 to i64
%21 = getelementptr inbounds float, float* %26, i64 %20
%22 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %21, i32 4) #8
%23 = fadd float %.03134.epil, %22
%24 = add nuw nsw i32 %.02535.epil, 256
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %.preheader.preheader.loopexit, label %.epil.preheader, !llvm.loop !65
.lr.ph: ; preds = %15
%25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0
%26 = load float*, float** %25, align 8
%27 = icmp sgt i32 %..i, 256
%smax = select i1 %27, i32 %..i, i32 256
%28 = add i32 %smax, -1
%29 = lshr i32 %28, 8
%30 = add nuw nsw i32 %29, 1
%xtraiter = and i32 %30, 7
%31 = icmp ult i32 %28, 1792
br i1 %31, label %.preheader.preheader.loopexit.unr-lcssa, label %.lr.ph.new
.lr.ph.new: ; preds = %.lr.ph
%unroll_iter = sub nsw i32 %30, %xtraiter
br label %32
; <label>:32: ; preds = %32, %.lr.ph.new
%.02535 = phi i32 [ 0, %.lr.ph.new ], [ %80, %32 ]
%.03134 = phi float [ 0.000000e+00, %.lr.ph.new ], [ %79, %32 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.new ], [ %niter.nsub.7, %32 ]
%33 = add nuw nsw i32 %.02535, %8
%34 = sext i32 %33 to i64
%35 = getelementptr inbounds float, float* %26, i64 %34
%36 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %35, i32 4) #8
%37 = fadd float %.03134, %36
%38 = or i32 %.02535, 256
%39 = add nuw nsw i32 %38, %8
%40 = sext i32 %39 to i64
%41 = getelementptr inbounds float, float* %26, i64 %40
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
%43 = fadd float %37, %42
%44 = or i32 %.02535, 512
%45 = add nuw nsw i32 %44, %8
%46 = sext i32 %45 to i64
%47 = getelementptr inbounds float, float* %26, i64 %46
%48 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %47, i32 4) #8
%49 = fadd float %43, %48
%50 = or i32 %.02535, 768
%51 = add nuw nsw i32 %50, %8
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %26, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = or i32 %.02535, 1024
%57 = add nuw nsw i32 %56, %8
%58 = sext i32 %57 to i64
%59 = getelementptr inbounds float, float* %26, i64 %58
%60 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %59, i32 4) #8
%61 = fadd float %55, %60
%62 = or i32 %.02535, 1280
%63 = add nuw nsw i32 %62, %8
%64 = sext i32 %63 to i64
%65 = getelementptr inbounds float, float* %26, i64 %64
%66 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %65, i32 4) #8
%67 = fadd float %61, %66
%68 = or i32 %.02535, 1536
%69 = add nuw nsw i32 %68, %8
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %26, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %67, %72
%74 = or i32 %.02535, 1792
%75 = add nuw nsw i32 %74, %8
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %26, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = fadd float %73, %78
%80 = add nsw i32 %.02535, 2048
%niter.nsub.7 = add i32 %niter, -8
%niter.ncmp.7 = icmp eq i32 %niter.nsub.7, 0
br i1 %niter.ncmp.7, label %.preheader.preheader.loopexit.unr-lcssa.loopexit, label %32, !llvm.loop !66
; <label>:81: ; preds = %.preheader
%.lcssa = phi float [ %85, %.preheader ]
%82 = and i32 %7, 31
%83 = icmp eq i32 %82, 0
br i1 %83, label %88, label %90
.preheader: ; preds = %.preheader.preheader, %.preheader
%.033 = phi i32 [ %86, %.preheader ], [ 16, %.preheader.preheader ]
%.132 = phi float [ %85, %.preheader ], [ %.132.ph, %.preheader.preheader ]
%84 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.132, i32 %.033, i32 31) #3, !srcloc !53
%85 = fadd float %.132, %84
%86 = lshr i32 %.033, 1
%87 = icmp eq i32 %86, 0
br i1 %87, label %81, label %.preheader, !llvm.loop !67
; <label>:88: ; preds = %81
%89 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %3, float %.lcssa) #8
br label %90
; <label>:90: ; preds = %88, %81
ret void
}
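; The kernel below is InnerReductionKernel<128, ..., SumReducer<float>>,
; reducing along the contiguous inner dimension. With a single block it first
; zeroes the per-row outputs. Each block then owns (output row, chunk of
; 128*blockDim.x coefficients) work items: a 16x-unrolled __ldg loop strides
; through the chunk at blockDim.x floats per step, a scalar tail loop
; finishes the ragged edge, a shfl.down tree combines each warp, and lane 0
; atomic-adds into output[row]; syncthreads brackets each work item.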
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = shl nuw nsw i32 %6, 7
%8 = add i32 %2, -1
%9 = add i32 %8, %7
%10 = udiv i32 %9, %7
%11 = mul nsw i32 %10, %3
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %6
%14 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%15 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%16 = icmp eq i32 %12, 1
br i1 %16, label %22, label %.preheader94
.preheader94.loopexit: ; preds = %.lr.ph109
br label %.preheader94
.preheader94: ; preds = %.preheader94.loopexit, %22, %5
%17 = icmp slt i32 %14, %11
br i1 %17, label %.lr.ph106, label %._crit_edge
.lr.ph106: ; preds = %.preheader94
%18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0
%19 = load float*, float** %18, align 8
%20 = and i32 %15, 31
%21 = icmp eq i32 %20, 0
br label %30
; <label>:22: ; preds = %5
%23 = mul nuw nsw i32 %14, %6
%24 = add nuw nsw i32 %23, %15
%25 = icmp slt i32 %24, %3
br i1 %25, label %.lr.ph109.preheader, label %.preheader94
.lr.ph109.preheader: ; preds = %22
br label %.lr.ph109
.lr.ph109: ; preds = %.lr.ph109.preheader, %.lr.ph109
%.081107 = phi i32 [ %28, %.lr.ph109 ], [ %24, %.lr.ph109.preheader ]
%26 = sext i32 %.081107 to i64
%27 = getelementptr inbounds float, float* %4, i64 %26
store float 0.000000e+00, float* %27, align 4
%28 = add nsw i32 %.081107, %13
%29 = icmp slt i32 %28, %3
br i1 %29, label %.lr.ph109, label %.preheader94.loopexit
._crit_edge.loopexit: ; preds = %177
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %.preheader94
ret void
; <label>:30: ; preds = %.lr.ph106, %177
%.083105 = phi i32 [ %14, %.lr.ph106 ], [ %178, %177 ]
%31 = sdiv i32 %.083105, %10
%32 = icmp slt i32 %31, %3
br i1 %32, label %33, label %177
; <label>:33: ; preds = %30
%34 = srem i32 %.083105, %10
%35 = mul i32 %7, %34
%36 = add i32 %35, %15
%37 = mul nsw i32 %31, %2
%38 = add i32 %36, %37
br label %39
; <label>:39: ; preds = %33, %.preheader.preheader
%.086100 = phi i32 [ 0, %33 ], [ %40, %.preheader.preheader ]
%.09299 = phi float [ 0.000000e+00, %33 ], [ %155, %.preheader.preheader ]
%40 = add nuw nsw i32 %.086100, 16
%41 = or i32 %.086100, 15
%42 = mul i32 %41, %6
%43 = add i32 %42, %36
%44 = icmp slt i32 %43, %2
%45 = mul i32 %.086100, %6
br i1 %44, label %.preheader.preheader, label %157
.preheader.preheader: ; preds = %39
%46 = add i32 %38, %45
%47 = sext i32 %46 to i64
%48 = getelementptr inbounds float, float* %19, i64 %47
%49 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %48, i32 4) #8
%50 = fadd float %.09299, %49
%51 = or i32 %.086100, 1
%52 = mul i32 %51, %6
%53 = add i32 %38, %52
%54 = sext i32 %53 to i64
%55 = getelementptr inbounds float, float* %19, i64 %54
%56 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %55, i32 4) #8
%57 = fadd float %50, %56
%58 = or i32 %.086100, 2
%59 = mul i32 %58, %6
%60 = add i32 %38, %59
%61 = sext i32 %60 to i64
%62 = getelementptr inbounds float, float* %19, i64 %61
%63 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %62, i32 4) #8
%64 = fadd float %57, %63
%65 = or i32 %.086100, 3
%66 = mul i32 %65, %6
%67 = add i32 %38, %66
%68 = sext i32 %67 to i64
%69 = getelementptr inbounds float, float* %19, i64 %68
%70 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %69, i32 4) #8
%71 = fadd float %64, %70
%72 = or i32 %.086100, 4
%73 = mul i32 %72, %6
%74 = add i32 %38, %73
%75 = sext i32 %74 to i64
%76 = getelementptr inbounds float, float* %19, i64 %75
%77 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %76, i32 4) #8
%78 = fadd float %71, %77
%79 = or i32 %.086100, 5
%80 = mul i32 %79, %6
%81 = add i32 %38, %80
%82 = sext i32 %81 to i64
%83 = getelementptr inbounds float, float* %19, i64 %82
%84 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %83, i32 4) #8
%85 = fadd float %78, %84
%86 = or i32 %.086100, 6
%87 = mul i32 %86, %6
%88 = add i32 %38, %87
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %19, i64 %89
%91 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %90, i32 4) #8
%92 = fadd float %85, %91
%93 = or i32 %.086100, 7
%94 = mul i32 %93, %6
%95 = add i32 %38, %94
%96 = sext i32 %95 to i64
%97 = getelementptr inbounds float, float* %19, i64 %96
%98 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %97, i32 4) #8
%99 = fadd float %92, %98
%100 = or i32 %.086100, 8
%101 = mul i32 %100, %6
%102 = add i32 %38, %101
%103 = sext i32 %102 to i64
%104 = getelementptr inbounds float, float* %19, i64 %103
%105 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %104, i32 4) #8
%106 = fadd float %99, %105
%107 = or i32 %.086100, 9
%108 = mul i32 %107, %6
%109 = add i32 %38, %108
%110 = sext i32 %109 to i64
%111 = getelementptr inbounds float, float* %19, i64 %110
%112 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %111, i32 4) #8
%113 = fadd float %106, %112
%114 = or i32 %.086100, 10
%115 = mul i32 %114, %6
%116 = add i32 %38, %115
%117 = sext i32 %116 to i64
%118 = getelementptr inbounds float, float* %19, i64 %117
%119 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %118, i32 4) #8
%120 = fadd float %113, %119
%121 = or i32 %.086100, 11
%122 = mul i32 %121, %6
%123 = add i32 %38, %122
%124 = sext i32 %123 to i64
%125 = getelementptr inbounds float, float* %19, i64 %124
%126 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %125, i32 4) #8
%127 = fadd float %120, %126
%128 = or i32 %.086100, 12
%129 = mul i32 %128, %6
%130 = add i32 %38, %129
%131 = sext i32 %130 to i64
%132 = getelementptr inbounds float, float* %19, i64 %131
%133 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %132, i32 4) #8
%134 = fadd float %127, %133
%135 = or i32 %.086100, 13
%136 = mul i32 %135, %6
%137 = add i32 %38, %136
%138 = sext i32 %137 to i64
%139 = getelementptr inbounds float, float* %19, i64 %138
%140 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %139, i32 4) #8
%141 = fadd float %134, %140
%142 = or i32 %.086100, 14
%143 = mul i32 %142, %6
%144 = add i32 %38, %143
%145 = sext i32 %144 to i64
%146 = getelementptr inbounds float, float* %19, i64 %145
%147 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %146, i32 4) #8
%148 = fadd float %141, %147
%149 = or i32 %.086100, 15
%150 = mul i32 %149, %6
%151 = add i32 %38, %150
%152 = sext i32 %151 to i64
%153 = getelementptr inbounds float, float* %19, i64 %152
%154 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %153, i32 4) #8
%155 = fadd float %148, %154
%156 = icmp slt i32 %40, 128
br i1 %156, label %39, label %.critedge.loopexit125
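; NOTE: .preheader.preheader above is the 16x-unrolled accumulation over one
; strip: sixteen __ldg loads at consecutive multiples of blockDim.x (%6) from
; the strip base %38, chained through fadd, entered only while the strip's
; last element (index %43) is still in bounds.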
; <label>:157: ; preds = %39
%.lcssa = phi i32 [ %45, %39 ]
%.09299.lcssa = phi float [ %.09299, %39 ]
%158 = add i32 %.lcssa, %36
%159 = icmp slt i32 %158, %2
br i1 %159, label %.lr.ph.preheader, label %.critedge
.lr.ph.preheader: ; preds = %157
br label %.lr.ph
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.084102 = phi i32 [ %165, %.lr.ph ], [ %158, %.lr.ph.preheader ]
%.1101 = phi float [ %164, %.lr.ph ], [ %.09299.lcssa, %.lr.ph.preheader ]
%160 = add nsw i32 %.084102, %37
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %19, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %.1101, %163
%165 = add i32 %.084102, %6
%166 = icmp slt i32 %165, %2
br i1 %166, label %.lr.ph, label %.critedge.loopexit
.critedge.loopexit: ; preds = %.lr.ph
%.lcssa134 = phi float [ %164, %.lr.ph ]
br label %.critedge
.critedge.loopexit125: ; preds = %.preheader.preheader
%.lcssa133 = phi float [ %155, %.preheader.preheader ]
br label %.critedge
.critedge: ; preds = %.critedge.loopexit125, %.critedge.loopexit, %157
%.3 = phi float [ %.09299.lcssa, %157 ], [ %.lcssa134, %.critedge.loopexit ], [ %.lcssa133, %.critedge.loopexit125 ]
tail call void @llvm.cuda.syncthreads()
br label %168
; <label>:167: ; preds = %168
%.lcssa135 = phi float [ %170, %168 ]
br i1 %21, label %173, label %177
; <label>:168: ; preds = %.critedge, %168
%.0104 = phi i32 [ 16, %.critedge ], [ %171, %168 ]
%.4103 = phi float [ %.3, %.critedge ], [ %170, %168 ]
%169 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.4103, i32 %.0104, i32 31) #3, !srcloc !53
%170 = fadd float %.4103, %169
%171 = lshr i32 %.0104, 1
%172 = icmp eq i32 %171, 0
br i1 %172, label %167, label %168
; <label>:173: ; preds = %167
%174 = sext i32 %31 to i64
%175 = getelementptr inbounds float, float* %4, i64 %174
%176 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %175, float %.lcssa135) #8
br label %177
; <label>:177: ; preds = %167, %173, %30
tail call void @llvm.cuda.syncthreads()
%178 = add i32 %.083105, %12
%179 = icmp slt i32 %178, %11
br i1 %179, label %30, label %._crit_edge.loopexit
}
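; The kernel below is OuterReductionKernel<16, ..., SumReducer<float>>,
; reducing along the outer (strided) dimension. With one block the output is
; zeroed first. Each thread grid-strides over (column, 16-row tile) pairs,
; serially sums up to 16 coefficients spaced %3 apart, and atomic-adds the
; tile sum into output[column]; no warp shuffle is needed here because each
; thread owns its tile outright.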
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%8 = mul nuw nsw i32 %7, %6
%9 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%10 = mul nuw nsw i32 %9, %6
%11 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%12 = add nuw nsw i32 %10, %11
%13 = icmp eq i32 %7, 1
br i1 %13, label %.preheader, label %19
.preheader: ; preds = %5
%14 = icmp slt i32 %12, %3
br i1 %14, label %.lr.ph60.preheader, label %._crit_edge61
.lr.ph60.preheader: ; preds = %.preheader
br label %.lr.ph60
._crit_edge61.loopexit: ; preds = %.lr.ph60
br label %._crit_edge61
._crit_edge61: ; preds = %._crit_edge61.loopexit, %.preheader
tail call void @llvm.cuda.syncthreads()
br label %19
.lr.ph60: ; preds = %.lr.ph60.preheader, %.lr.ph60
%.059 = phi i32 [ %17, %.lr.ph60 ], [ %12, %.lr.ph60.preheader ]
%15 = sext i32 %.059 to i64
%16 = getelementptr inbounds float, float* %4, i64 %15
store float 0.000000e+00, float* %16, align 4
%17 = add nsw i32 %.059, %8
%18 = icmp slt i32 %17, %3
br i1 %18, label %.lr.ph60, label %._crit_edge61.loopexit
; <label>:19: ; preds = %._crit_edge61, %5
%20 = add i32 %2, 15
%21 = sdiv i32 %20, 16
%22 = mul nsw i32 %21, %3
%23 = icmp slt i32 %12, %22
br i1 %23, label %.lr.ph57, label %._crit_edge58
.lr.ph57: ; preds = %19
%24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0
%25 = load float*, float** %24, align 8
br label %26
._crit_edge58.loopexit: ; preds = %._crit_edge
br label %._crit_edge58
._crit_edge58: ; preds = %._crit_edge58.loopexit, %19
ret void
; <label>:26: ; preds = %.lr.ph57, %._crit_edge
%.04755 = phi i32 [ %12, %.lr.ph57 ], [ %36, %._crit_edge ]
%27 = srem i32 %.04755, %3
%28 = sdiv i32 %.04755, %3
%29 = shl nsw i32 %28, 4
%30 = add nsw i32 %29, 16
%31 = icmp sgt i32 %30, %2
%..i = select i1 %31, i32 %2, i32 %30
%32 = icmp slt i32 %29, %..i
br i1 %32, label %.lr.ph.preheader, label %._crit_edge
.lr.ph.preheader: ; preds = %26
br label %.lr.ph
._crit_edge.loopexit: ; preds = %.lr.ph
%.lcssa = phi float [ %43, %.lr.ph ]
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %26
%.052.lcssa = phi float [ 0.000000e+00, %26 ], [ %.lcssa, %._crit_edge.loopexit ]
%33 = sext i32 %27 to i64
%34 = getelementptr inbounds float, float* %4, i64 %33
%35 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %34, float %.052.lcssa) #8
%36 = add nsw i32 %.04755, %8
%37 = icmp slt i32 %36, %22
br i1 %37, label %26, label %._crit_edge58.loopexit
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%.04654 = phi i32 [ %44, %.lr.ph ], [ %29, %.lr.ph.preheader ]
%.05253 = phi float [ %43, %.lr.ph ], [ 0.000000e+00, %.lr.ph.preheader ]
%38 = mul nsw i32 %.04654, %3
%39 = add nsw i32 %38, %27
%40 = sext i32 %39 to i64
%41 = getelementptr inbounds float, float* %25, i64 %40
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
%43 = fadd float %.05253, %42
%44 = add nsw i32 %.04654, 1
%45 = icmp slt i32 %44, %..i
br i1 %45, label %.lr.ph, label %._crit_edge.loopexit
}
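; The final kernel is the generic elementwise EigenMetaKernel instantiated
; for TensorEvaluator<const TensorEvalToOp<const TensorReductionOp<
; SumReducer<float>, ...>>, GpuDevice>: a plain grid-stride loop over output
; coefficients into which the reduction's evalScalar has been inlined, so
; each thread sums its reduced slice with a 4x-unrolled (plus epilogue) __ldg
; loop and stores the result into the eval-to buffer — presumably the generic
; fallback path when none of the specialized kernels above is selected.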
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.14"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.444.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 7
%.sroa.444.0..sroa_cast = bitcast i32* %.sroa.444.0..sroa_idx to i64*
%.sroa.444.0.copyload = load i64, i64* %.sroa.444.0..sroa_cast, align 8
%.sroa.546.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 9, i32 0, i64 0
%.sroa.546.0.copyload = load i32, i32* %.sroa.546.0..sroa_idx, align 8
%.sroa.750.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 10, i32 0
%.sroa.750.0.copyload = load float*, float** %.sroa.750.0..sroa_idx, align 8
%.sroa.9.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 2
%.sroa.9.0.copyload = load float*, float** %.sroa.9.0..sroa_idx, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
.lr.ph.i: ; preds = %2
%11 = trunc i64 %.sroa.444.0.copyload to i32
%12 = icmp sgt i32 %.sroa.546.0.copyload, 0
%13 = lshr i64 %.sroa.444.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i
br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i
%15 = add i32 %.sroa.546.0.copyload, -1
%xtraiter = and i32 %.sroa.546.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.546.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i
br label %18
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa66 = phi i32 [ %47, %18 ]
%.lcssa65 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
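; annotation: remainder (epilogue) loop for the unroll above. It runs the
; leftover %xtraiter (= trip count mod 4) iterations one load/fadd at a
; time, starting from the accumulator and index carried out of the unrolled
; body via the .unr phis.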
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; <label>:48: ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !68
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa67 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit
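; annotation: the non-".us" version of the output loop. No input elements
; are accumulated on this path, so each strided output element is simply
; set to 0.0f, the identity of SumReducer<float>.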
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit, %2
ret void
}
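; annotation: row-reduction kernel instantiated for the sum reducer
; (template arguments <32, 256, 128> per the mangled name). The chain of
; blocks below validates the launch configuration -- blockDim must be
; 256x1x1 and gridDim 32x1x1 -- and routes any mismatch to __assert_fail
; with the message strings @.str through @.str.6.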
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
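; annotation: grid-stride work loop setup. %31 = ceil(%3 / 32768) is the
; number of 32768-wide chunks per row (%3 being the per-row element count,
; used as the row stride at %47), and %32 = %31 * %2 the total (row, chunk)
; work items. Blocks walk that space in steps of 32, the asserted gridDim.x;
; inside the loop at %41, srem/sdiv by %31 split the linear index back into
; a chunk and a row, and %45 = chunk*32768 + tid is the thread's base column.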
.lr.ph: ; preds = %29
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%38 = load float*, float** %37, align 8
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
br label %41
._crit_edge.loopexit: ; preds = %187
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:41: ; preds = %.lr.ph, %187
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ]
%42 = srem i32 %.0114, %31
%43 = sdiv i32 %.0114, %31
%44 = shl nsw i32 %42, 15
%45 = or i32 %44, %34
%46 = icmp slt i32 %43, %2
br i1 %46, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %164, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %41
%47 = mul nsw i32 %43, %3
%48 = add i32 %47, %45
br label %49
; <label>:49: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ]
%50 = add nuw nsw i32 %.098108, 16
%51 = shl i32 %.098108, 8
%52 = or i32 %51, 3840
%53 = add nsw i32 %52, %45
%54 = icmp slt i32 %53, %3
br i1 %54, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %49
%55 = add i32 %48, %51
%56 = sext i32 %55 to i64
%57 = getelementptr inbounds float, float* %40, i64 %56
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8
%59 = fadd float %.095109, %58
%60 = shl i32 %.098108, 8
%61 = or i32 %60, 256
%62 = add i32 %48, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, float* %40, i64 %63
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8
%66 = fadd float %59, %65
%67 = shl i32 %.098108, 8
%68 = or i32 %67, 512
%69 = add i32 %48, %68
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %40, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %66, %72
%74 = shl i32 %.098108, 8
%75 = or i32 %74, 768
%76 = add i32 %48, %75
%77 = sext i32 %76 to i64
%78 = getelementptr inbounds float, float* %40, i64 %77
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8
%80 = fadd float %73, %79
%81 = shl i32 %.098108, 8
%82 = or i32 %81, 1024
%83 = add i32 %48, %82
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %40, i64 %84
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8
%87 = fadd float %80, %86
%88 = shl i32 %.098108, 8
%89 = or i32 %88, 1280
%90 = add i32 %48, %89
%91 = sext i32 %90 to i64
%92 = getelementptr inbounds float, float* %40, i64 %91
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8
%94 = fadd float %87, %93
%95 = shl i32 %.098108, 8
%96 = or i32 %95, 1536
%97 = add i32 %48, %96
%98 = sext i32 %97 to i64
%99 = getelementptr inbounds float, float* %40, i64 %98
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8
%101 = fadd float %94, %100
%102 = shl i32 %.098108, 8
%103 = or i32 %102, 1792
%104 = add i32 %48, %103
%105 = sext i32 %104 to i64
%106 = getelementptr inbounds float, float* %40, i64 %105
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8
%108 = fadd float %101, %107
%109 = shl i32 %.098108, 8
%110 = or i32 %109, 2048
%111 = add i32 %48, %110
%112 = sext i32 %111 to i64
%113 = getelementptr inbounds float, float* %40, i64 %112
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8
%115 = fadd float %108, %114
%116 = shl i32 %.098108, 8
%117 = or i32 %116, 2304
%118 = add i32 %48, %117
%119 = sext i32 %118 to i64
%120 = getelementptr inbounds float, float* %40, i64 %119
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8
%122 = fadd float %115, %121
%123 = shl i32 %.098108, 8
%124 = or i32 %123, 2560
%125 = add i32 %48, %124
%126 = sext i32 %125 to i64
%127 = getelementptr inbounds float, float* %40, i64 %126
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8
%129 = fadd float %122, %128
%130 = shl i32 %.098108, 8
%131 = or i32 %130, 2816
%132 = add i32 %48, %131
%133 = sext i32 %132 to i64
%134 = getelementptr inbounds float, float* %40, i64 %133
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8
%136 = fadd float %129, %135
%137 = shl i32 %.098108, 8
%138 = or i32 %137, 3072
%139 = add i32 %48, %138
%140 = sext i32 %139 to i64
%141 = getelementptr inbounds float, float* %40, i64 %140
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8
%143 = fadd float %136, %142
%144 = shl i32 %.098108, 8
%145 = or i32 %144, 3328
%146 = add i32 %48, %145
%147 = sext i32 %146 to i64
%148 = getelementptr inbounds float, float* %40, i64 %147
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8
%150 = fadd float %143, %149
%151 = shl i32 %.098108, 8
%152 = or i32 %151, 3584
%153 = add i32 %48, %152
%154 = sext i32 %153 to i64
%155 = getelementptr inbounds float, float* %40, i64 %154
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8
%157 = fadd float %150, %156
%158 = shl i32 %.098108, 8
%159 = or i32 %158, 3840
%160 = add i32 %48, %159
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %40, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %157, %163
%165 = icmp slt i32 %50, 128
br i1 %165, label %49, label %.thread.preheader.loopexit
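; annotation: the loop at %49 above is the fast path for fully in-bounds
; tiles. %.098108 advances 0, 16, ..., 112, and each trip is completely
; unrolled into sixteen ldg/fadd pairs at column offsets
; (%.098108 << 8) | k*256 for k = 0..15. .preheader101 and the chain of
; blocks from %168 onward form the guarded tail: the same sixteen strided
; loads, but each preceded by an icmp against the row length %3, bailing to
; .thread.preheader as soon as a column falls out of range.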
.preheader101: ; preds = %49
%.lcssa = phi i32 [ %51, %49 ]
%.098108.lcssa = phi i32 [ %.098108, %49 ]
%.095109.lcssa = phi float [ %.095109, %49 ]
%166 = add nsw i32 %.lcssa, %45
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %47
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %40, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = fadd float %.095109.lcssa, %172
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %45
%177 = icmp slt i32 %176, %3
br i1 %177, label %190, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %46, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %187, label %183
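; annotation: warp-level reduction. The .thread loop below applies
; shfl.down.b32 with offsets 16, 8, 4, 2, 1 (%.092113 is halved each trip)
; and accumulates with fadd, so after five trips lane 0 of each warp holds
; the warp's sum. Block %178 above then lets only lane 0 of a valid row
; proceed (%.not is "lane != 0", %.not99 is "row out of range") to %183,
; which publishes the result with a float atomic add.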
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = fadd float %.8112, %179
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !69
; <label>:183: ; preds = %178
%184 = sext i32 %43 to i64
%185 = getelementptr inbounds float, float* %38, i64 %184
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8
br label %187
; <label>:187: ; preds = %178, %183
%188 = add nuw nsw i32 %.0114, 32
%189 = icmp slt i32 %188, %32
br i1 %189, label %41, label %._crit_edge.loopexit
; <label>:190: ; preds = %168
%191 = add nsw i32 %176, %47
%192 = sext i32 %191 to i64
%193 = getelementptr inbounds float, float* %40, i64 %192
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8
%195 = fadd float %173, %194
%196 = shl i32 %.098108.lcssa, 8
%197 = or i32 %196, 512
%198 = add nsw i32 %197, %45
%199 = icmp slt i32 %198, %3
br i1 %199, label %200, label %.thread.preheader
; <label>:200: ; preds = %190
%201 = add nsw i32 %198, %47
%202 = sext i32 %201 to i64
%203 = getelementptr inbounds float, float* %40, i64 %202
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8
%205 = fadd float %195, %204
%206 = shl i32 %.098108.lcssa, 8
%207 = or i32 %206, 768
%208 = add nsw i32 %207, %45
%209 = icmp slt i32 %208, %3
br i1 %209, label %210, label %.thread.preheader
; <label>:210: ; preds = %200
%211 = add nsw i32 %208, %47
%212 = sext i32 %211 to i64
%213 = getelementptr inbounds float, float* %40, i64 %212
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8
%215 = fadd float %205, %214
%216 = shl i32 %.098108.lcssa, 8
%217 = or i32 %216, 1024
%218 = add nsw i32 %217, %45
%219 = icmp slt i32 %218, %3
br i1 %219, label %220, label %.thread.preheader
; <label>:220: ; preds = %210
%221 = add nsw i32 %218, %47
%222 = sext i32 %221 to i64
%223 = getelementptr inbounds float, float* %40, i64 %222
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8
%225 = fadd float %215, %224
%226 = shl i32 %.098108.lcssa, 8
%227 = or i32 %226, 1280
%228 = add nsw i32 %227, %45
%229 = icmp slt i32 %228, %3
br i1 %229, label %230, label %.thread.preheader
; <label>:230: ; preds = %220
%231 = add nsw i32 %228, %47
%232 = sext i32 %231 to i64
%233 = getelementptr inbounds float, float* %40, i64 %232
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8
%235 = fadd float %225, %234
%236 = shl i32 %.098108.lcssa, 8
%237 = or i32 %236, 1536
%238 = add nsw i32 %237, %45
%239 = icmp slt i32 %238, %3
br i1 %239, label %240, label %.thread.preheader
; <label>:240: ; preds = %230
%241 = add nsw i32 %238, %47
%242 = sext i32 %241 to i64
%243 = getelementptr inbounds float, float* %40, i64 %242
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8
%245 = fadd float %235, %244
%246 = shl i32 %.098108.lcssa, 8
%247 = or i32 %246, 1792
%248 = add nsw i32 %247, %45
%249 = icmp slt i32 %248, %3
br i1 %249, label %250, label %.thread.preheader
; <label>:250: ; preds = %240
%251 = add nsw i32 %248, %47
%252 = sext i32 %251 to i64
%253 = getelementptr inbounds float, float* %40, i64 %252
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8
%255 = fadd float %245, %254
%256 = shl i32 %.098108.lcssa, 8
%257 = or i32 %256, 2048
%258 = add nsw i32 %257, %45
%259 = icmp slt i32 %258, %3
br i1 %259, label %260, label %.thread.preheader
; <label>:260: ; preds = %250
%261 = add nsw i32 %258, %47
%262 = sext i32 %261 to i64
%263 = getelementptr inbounds float, float* %40, i64 %262
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8
%265 = fadd float %255, %264
%266 = shl i32 %.098108.lcssa, 8
%267 = or i32 %266, 2304
%268 = add nsw i32 %267, %45
%269 = icmp slt i32 %268, %3
br i1 %269, label %270, label %.thread.preheader
; <label>:270: ; preds = %260
%271 = add nsw i32 %268, %47
%272 = sext i32 %271 to i64
%273 = getelementptr inbounds float, float* %40, i64 %272
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8
%275 = fadd float %265, %274
%276 = shl i32 %.098108.lcssa, 8
%277 = or i32 %276, 2560
%278 = add nsw i32 %277, %45
%279 = icmp slt i32 %278, %3
br i1 %279, label %280, label %.thread.preheader
; <label>:280: ; preds = %270
%281 = add nsw i32 %278, %47
%282 = sext i32 %281 to i64
%283 = getelementptr inbounds float, float* %40, i64 %282
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8
%285 = fadd float %275, %284
%286 = shl i32 %.098108.lcssa, 8
%287 = or i32 %286, 2816
%288 = add nsw i32 %287, %45
%289 = icmp slt i32 %288, %3
br i1 %289, label %290, label %.thread.preheader
; <label>:290: ; preds = %280
%291 = add nsw i32 %288, %47
%292 = sext i32 %291 to i64
%293 = getelementptr inbounds float, float* %40, i64 %292
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8
%295 = fadd float %285, %294
%296 = shl i32 %.098108.lcssa, 8
%297 = or i32 %296, 3072
%298 = add nsw i32 %297, %45
%299 = icmp slt i32 %298, %3
br i1 %299, label %300, label %.thread.preheader
; <label>:300: ; preds = %290
%301 = add nsw i32 %298, %47
%302 = sext i32 %301 to i64
%303 = getelementptr inbounds float, float* %40, i64 %302
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8
%305 = fadd float %295, %304
%306 = shl i32 %.098108.lcssa, 8
%307 = or i32 %306, 3328
%308 = add nsw i32 %307, %45
%309 = icmp slt i32 %308, %3
br i1 %309, label %310, label %.thread.preheader
; <label>:310: ; preds = %300
%311 = add nsw i32 %308, %47
%312 = sext i32 %311 to i64
%313 = getelementptr inbounds float, float* %40, i64 %312
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8
%315 = fadd float %305, %314
%316 = shl i32 %.098108.lcssa, 8
%317 = or i32 %316, 3584
%318 = add nsw i32 %317, %45
%319 = icmp slt i32 %318, %3
br i1 %319, label %320, label %.thread.preheader
; <label>:320: ; preds = %310
%321 = add nsw i32 %318, %47
%322 = sext i32 %321 to i64
%323 = getelementptr inbounds float, float* %40, i64 %322
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8
%325 = fadd float %315, %324
br label %.thread.preheader
}
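; annotation: the same RowReduceKernel skeleton specialized for
; CudaMaxReducer. The accumulator is seeded from the float stored in the
; reducer (%.idx.val) instead of 0.0, every fadd becomes a call to
; llvm.nvvm.fmax.f, and the final cross-warp combine is a compare-and-swap
; loop (CudaMaxReducer::atomic_reduce) rather than an atomic float add.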
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
br label %39
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ]
%40 = srem i32 %.0114, %31
%41 = sdiv i32 %.0114, %31
%42 = shl nsw i32 %40, 15
%43 = or i32 %42, %34
%.idx.val = load float, float* %.idx, align 4
%44 = icmp slt i32 %41, %2
br i1 %44, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %163, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %39
%45 = mul nsw i32 %41, %3
%46 = add i32 %45, %43
%47 = load float*, float** %38, align 8
br label %48
; <label>:48: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ]
%49 = add nuw nsw i32 %.098108, 16
%50 = shl i32 %.098108, 8
%51 = or i32 %50, 3840
%52 = add nsw i32 %51, %43
%53 = icmp slt i32 %52, %3
br i1 %53, label %.preheader.preheader, label %.preheader101
.preheader.preheader: ; preds = %48
%54 = add i32 %46, %50
%55 = sext i32 %54 to i64
%56 = getelementptr inbounds float, float* %47, i64 %55
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8
%59 = shl i32 %.098108, 8
%60 = or i32 %59, 256
%61 = add i32 %46, %60
%62 = sext i32 %61 to i64
%63 = getelementptr inbounds float, float* %47, i64 %62
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8
%66 = shl i32 %.098108, 8
%67 = or i32 %66, 512
%68 = add i32 %46, %67
%69 = sext i32 %68 to i64
%70 = getelementptr inbounds float, float* %47, i64 %69
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8
%73 = shl i32 %.098108, 8
%74 = or i32 %73, 768
%75 = add i32 %46, %74
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %47, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8
%80 = shl i32 %.098108, 8
%81 = or i32 %80, 1024
%82 = add i32 %46, %81
%83 = sext i32 %82 to i64
%84 = getelementptr inbounds float, float* %47, i64 %83
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8
%87 = shl i32 %.098108, 8
%88 = or i32 %87, 1280
%89 = add i32 %46, %88
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds float, float* %47, i64 %90
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8
%94 = shl i32 %.098108, 8
%95 = or i32 %94, 1536
%96 = add i32 %46, %95
%97 = sext i32 %96 to i64
%98 = getelementptr inbounds float, float* %47, i64 %97
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8
%101 = shl i32 %.098108, 8
%102 = or i32 %101, 1792
%103 = add i32 %46, %102
%104 = sext i32 %103 to i64
%105 = getelementptr inbounds float, float* %47, i64 %104
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8
%108 = shl i32 %.098108, 8
%109 = or i32 %108, 2048
%110 = add i32 %46, %109
%111 = sext i32 %110 to i64
%112 = getelementptr inbounds float, float* %47, i64 %111
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8
%115 = shl i32 %.098108, 8
%116 = or i32 %115, 2304
%117 = add i32 %46, %116
%118 = sext i32 %117 to i64
%119 = getelementptr inbounds float, float* %47, i64 %118
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8
%122 = shl i32 %.098108, 8
%123 = or i32 %122, 2560
%124 = add i32 %46, %123
%125 = sext i32 %124 to i64
%126 = getelementptr inbounds float, float* %47, i64 %125
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8
%129 = shl i32 %.098108, 8
%130 = or i32 %129, 2816
%131 = add i32 %46, %130
%132 = sext i32 %131 to i64
%133 = getelementptr inbounds float, float* %47, i64 %132
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8
%136 = shl i32 %.098108, 8
%137 = or i32 %136, 3072
%138 = add i32 %46, %137
%139 = sext i32 %138 to i64
%140 = getelementptr inbounds float, float* %47, i64 %139
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8
%143 = shl i32 %.098108, 8
%144 = or i32 %143, 3328
%145 = add i32 %46, %144
%146 = sext i32 %145 to i64
%147 = getelementptr inbounds float, float* %47, i64 %146
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8
%150 = shl i32 %.098108, 8
%151 = or i32 %150, 3584
%152 = add i32 %46, %151
%153 = sext i32 %152 to i64
%154 = getelementptr inbounds float, float* %47, i64 %153
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8
%157 = shl i32 %.098108, 8
%158 = or i32 %157, 3840
%159 = add i32 %46, %158
%160 = sext i32 %159 to i64
%161 = getelementptr inbounds float, float* %47, i64 %160
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8
%164 = icmp slt i32 %49, 128
br i1 %164, label %48, label %.thread.preheader.loopexit
.preheader101: ; preds = %48
%.lcssa = phi i32 [ %50, %48 ]
%.098108.lcssa = phi i32 [ %.098108, %48 ]
%.095109.lcssa = phi float [ %.095109, %48 ]
%165 = load float*, float** %38, align 8
%166 = add nsw i32 %.lcssa, %43
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %45
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %165, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %43
%177 = icmp slt i32 %176, %3
br i1 %177, label %198, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %44, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !70
; <label>:183: ; preds = %178
%184 = load float*, float** %37, align 8
%185 = sext i32 %41 to i64
%186 = getelementptr inbounds float, float* %184, i64 %185
%187 = bitcast float %.lcssa138 to i32
%188 = bitcast float* %186 to i32*
%189 = load i32, i32* %188, align 4
br label %190
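; annotation: atomic max emulated with cmpxchg. %187 is the warp result
; reinterpreted as i32 bits and %189 the current output bits; the loop
; below retries the compare-and-swap until either the stored value is no
; longer smaller than the warp result (%192 fails) or the exchange succeeds
; (%not..i observes the expected value).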
; <label>:190: ; preds = %193, %183
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ]
%191 = bitcast i32 %.011.i to float
%192 = fcmp olt float %191, %.lcssa138
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit
; <label>:193: ; preds = %190
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst
%195 = extractvalue { i32, i1 } %194, 0
%not..i = icmp eq i32 %.011.i, %195
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178
%196 = add nuw nsw i32 %.0114, 32
%197 = icmp slt i32 %196, %32
br i1 %197, label %39, label %._crit_edge.loopexit
; <label>:198: ; preds = %168
%199 = add nsw i32 %176, %45
%200 = sext i32 %199 to i64
%201 = getelementptr inbounds float, float* %165, i64 %200
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8
%204 = shl i32 %.098108.lcssa, 8
%205 = or i32 %204, 512
%206 = add nsw i32 %205, %43
%207 = icmp slt i32 %206, %3
br i1 %207, label %208, label %.thread.preheader
; <label>:208: ; preds = %198
%209 = add nsw i32 %206, %45
%210 = sext i32 %209 to i64
%211 = getelementptr inbounds float, float* %165, i64 %210
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8
%214 = shl i32 %.098108.lcssa, 8
%215 = or i32 %214, 768
%216 = add nsw i32 %215, %43
%217 = icmp slt i32 %216, %3
br i1 %217, label %218, label %.thread.preheader
; <label>:218: ; preds = %208
%219 = add nsw i32 %216, %45
%220 = sext i32 %219 to i64
%221 = getelementptr inbounds float, float* %165, i64 %220
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8
%224 = shl i32 %.098108.lcssa, 8
%225 = or i32 %224, 1024
%226 = add nsw i32 %225, %43
%227 = icmp slt i32 %226, %3
br i1 %227, label %228, label %.thread.preheader
; <label>:228: ; preds = %218
%229 = add nsw i32 %226, %45
%230 = sext i32 %229 to i64
%231 = getelementptr inbounds float, float* %165, i64 %230
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8
%234 = shl i32 %.098108.lcssa, 8
%235 = or i32 %234, 1280
%236 = add nsw i32 %235, %43
%237 = icmp slt i32 %236, %3
br i1 %237, label %238, label %.thread.preheader
; <label>:238: ; preds = %228
%239 = add nsw i32 %236, %45
%240 = sext i32 %239 to i64
%241 = getelementptr inbounds float, float* %165, i64 %240
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8
%244 = shl i32 %.098108.lcssa, 8
%245 = or i32 %244, 1536
%246 = add nsw i32 %245, %43
%247 = icmp slt i32 %246, %3
br i1 %247, label %248, label %.thread.preheader
; <label>:248: ; preds = %238
%249 = add nsw i32 %246, %45
%250 = sext i32 %249 to i64
%251 = getelementptr inbounds float, float* %165, i64 %250
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8
%254 = shl i32 %.098108.lcssa, 8
%255 = or i32 %254, 1792
%256 = add nsw i32 %255, %43
%257 = icmp slt i32 %256, %3
br i1 %257, label %258, label %.thread.preheader
; <label>:258: ; preds = %248
%259 = add nsw i32 %256, %45
%260 = sext i32 %259 to i64
%261 = getelementptr inbounds float, float* %165, i64 %260
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8
%264 = shl i32 %.098108.lcssa, 8
%265 = or i32 %264, 2048
%266 = add nsw i32 %265, %43
%267 = icmp slt i32 %266, %3
br i1 %267, label %268, label %.thread.preheader
; <label>:268: ; preds = %258
%269 = add nsw i32 %266, %45
%270 = sext i32 %269 to i64
%271 = getelementptr inbounds float, float* %165, i64 %270
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8
%274 = shl i32 %.098108.lcssa, 8
%275 = or i32 %274, 2304
%276 = add nsw i32 %275, %43
%277 = icmp slt i32 %276, %3
br i1 %277, label %278, label %.thread.preheader
; <label>:278: ; preds = %268
%279 = add nsw i32 %276, %45
%280 = sext i32 %279 to i64
%281 = getelementptr inbounds float, float* %165, i64 %280
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8
%284 = shl i32 %.098108.lcssa, 8
%285 = or i32 %284, 2560
%286 = add nsw i32 %285, %43
%287 = icmp slt i32 %286, %3
br i1 %287, label %288, label %.thread.preheader
; <label>:288: ; preds = %278
%289 = add nsw i32 %286, %45
%290 = sext i32 %289 to i64
%291 = getelementptr inbounds float, float* %165, i64 %290
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8
%294 = shl i32 %.098108.lcssa, 8
%295 = or i32 %294, 2816
%296 = add nsw i32 %295, %43
%297 = icmp slt i32 %296, %3
br i1 %297, label %298, label %.thread.preheader
; <label>:298: ; preds = %288
%299 = add nsw i32 %296, %45
%300 = sext i32 %299 to i64
%301 = getelementptr inbounds float, float* %165, i64 %300
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8
%304 = shl i32 %.098108.lcssa, 8
%305 = or i32 %304, 3072
%306 = add nsw i32 %305, %43
%307 = icmp slt i32 %306, %3
br i1 %307, label %308, label %.thread.preheader
; <label>:308: ; preds = %298
%309 = add nsw i32 %306, %45
%310 = sext i32 %309 to i64
%311 = getelementptr inbounds float, float* %165, i64 %310
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8
%314 = shl i32 %.098108.lcssa, 8
%315 = or i32 %314, 3328
%316 = add nsw i32 %315, %43
%317 = icmp slt i32 %316, %3
br i1 %317, label %318, label %.thread.preheader
; <label>:318: ; preds = %308
%319 = add nsw i32 %316, %45
%320 = sext i32 %319 to i64
%321 = getelementptr inbounds float, float* %165, i64 %320
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8
%324 = shl i32 %.098108.lcssa, 8
%325 = or i32 %324, 3584
%326 = add nsw i32 %325, %43
%327 = icmp slt i32 %326, %3
br i1 %327, label %328, label %.thread.preheader
; <label>:328: ; preds = %318
%329 = add nsw i32 %326, %45
%330 = sext i32 %329 to i64
%331 = getelementptr inbounds float, float* %165, i64 %330
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8
br label %.thread.preheader
}
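; annotation: column-reduction kernel for the sum reducer (template
; arguments <128, 256, 16>); here gridDim.x is asserted to be 128. Each
; thread privately sums up to 16 elements of one column (the loop at %53
; below, unrolled by 2, with per-element bounds checks against the row
; count %2), then folds its partial sum into the output with a single float
; atomic add at %70 -- no warp shuffle is needed because different threads
; own different columns.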
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
%41 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
%42 = load float*, float** %41, align 8
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
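; annotation: %38 tests %3 > -1, i.e. a non-negative column count. The
; ".us" path below is the real reduction; the .lr.ph.split path is the
; degenerate complement, which the optimizer has reduced to atomic adds of
; +0.0 on the strided outputs (presumably because no element can be in
; bounds when %3 is negative).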
.lr.ph.split.preheader: ; preds = %.lr.ph
%43 = add i32 %32, -1
%44 = sub i32 %43, %34
%45 = sub i32 %44, %35
%46 = lshr i32 %45, 15
%47 = add nuw nsw i32 %46, 1
%xtraiter = and i32 %47, 3
%48 = icmp ult i32 %45, 98304
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader
%unroll_iter = sub nsw i32 %47, %xtraiter
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ]
%49 = srem i32 %.047.us, %3
%50 = sdiv i32 %.047.us, %3
%51 = srem i32 %50, %31
%52 = shl nsw i32 %51, 4
br label %53
; <label>:53: ; preds = %104, %.lr.ph.split.us
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ]
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ]
%54 = add nuw nsw i32 %.04346.us.us, %52
%55 = icmp slt i32 %54, %2
br i1 %55, label %56, label %62
; <label>:56: ; preds = %53
%57 = mul nsw i32 %54, %3
%58 = add nsw i32 %57, %49
%59 = sext i32 %58 to i64
%60 = getelementptr inbounds float, float* %40, i64 %59
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8
br label %62
; <label>:62: ; preds = %56, %53
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ]
%64 = fadd float %.04445.us.us, %63
%65 = or i32 %.04346.us.us, 1
%66 = add nuw nsw i32 %65, %52
%67 = icmp slt i32 %66, %2
br i1 %67, label %98, label %104
.us-lcssa.us.us: ; preds = %104
%.lcssa = phi float [ %106, %104 ]
%68 = sext i32 %49 to i64
%69 = getelementptr inbounds float, float* %42, i64 %68
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8
%71 = add nuw nsw i32 %.047.us, 32768
%72 = icmp slt i32 %71, %32
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us
br label %._crit_edge
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ]
br label %._crit_edge.loopexit59.unr-lcssa
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa
br label %.lr.ph.split.epil
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ]
%73 = srem i32 %.047.epil, %3
%74 = sext i32 %73 to i64
%75 = getelementptr inbounds float, float* %42, i64 %74
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8
%77 = add nuw nsw i32 %.047.epil, 32768
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !71
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil
br label %._crit_edge.loopexit59
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29
ret void
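; annotation: the degenerate loop itself, unrolled by 4. Each trip performs
; four atomicAdd(out[srem(idx, %3)], 0.0f) calls while the linear index
; steps by 32768, with the epilogue blocks above soaking up the remaining
; trip count mod 4.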
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ]
%78 = srem i32 %.047, %3
%79 = sext i32 %78 to i64
%80 = getelementptr inbounds float, float* %42, i64 %79
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8
%82 = add nuw nsw i32 %.047, 32768
%83 = srem i32 %82, %3
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %42, i64 %84
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8
%87 = add nsw i32 %.047, 65536
%88 = srem i32 %87, %3
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %42, i64 %89
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8
%92 = add nsw i32 %.047, 98304
%93 = srem i32 %92, %3
%94 = sext i32 %93 to i64
%95 = getelementptr inbounds float, float* %42, i64 %94
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8
%97 = add nsw i32 %.047, 131072
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split
; <label>:98: ; preds = %62
%99 = mul nsw i32 %66, %3
%100 = add nsw i32 %99, %49
%101 = sext i32 %100 to i64
%102 = getelementptr inbounds float, float* %40, i64 %101
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8
br label %104
; <label>:104: ; preds = %98, %62
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ]
%106 = fadd float %64, %105
%107 = add nsw i32 %.04346.us.us, 2
%exitcond.1 = icmp eq i32 %107, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53
}
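; annotation: max-reducer instantiation of the same
; ColumnReduceKernel<128, 256, 16> shape. The preamble below repeats the
; launch-configuration checks and takes the address of the float stored in
; the CudaMaxReducer (%.idx45), which seeds the accumulator in place of the
; sum variant's 0.0f.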
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ]
%41 = srem i32 %.048.us, %3
%42 = sdiv i32 %.048.us, %3
%43 = srem i32 %42, %31
%44 = shl nsw i32 %43, 4
%.idx45.val.us = load float, float* %.idx45, align 4
%45 = load float*, float** %39, align 8
br label %54
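; Blocks 46/49 are the inlined CudaMaxReducer::atomic_reduce: the candidate
; max is committed with a 32-bit cmpxchg retry loop that stops as soon as the
; stored value is no longer smaller than the candidate or the CAS succeeds.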
; <label>:46: ; preds = %49, %.us-lcssa.us.us
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ]
%47 = bitcast i32 %.011.i.us to float
%48 = fcmp olt float %47, %.lcssa
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
; <label>:49: ; preds = %46
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst
%51 = extractvalue { i32, i1 } %50, 0
%not..i.us = icmp eq i32 %.011.i.us, %51
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46
%52 = add nuw nsw i32 %.048.us, 32768
%53 = icmp slt i32 %52, %32
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit
; <label>:54: ; preds = %112, %.lr.ph.split.us
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ]
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ]
%55 = add nuw nsw i32 %.04347.us.us, %44
%56 = icmp slt i32 %55, %2
br i1 %56, label %57, label %63
; <label>:57: ; preds = %54
%58 = mul nsw i32 %55, %3
%59 = add nsw i32 %58, %41
%60 = sext i32 %59 to i64
%61 = getelementptr inbounds float, float* %45, i64 %60
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8
br label %63
; <label>:63: ; preds = %54, %57
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ]
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8
%66 = or i32 %.04347.us.us, 1
%67 = add nuw nsw i32 %66, %44
%68 = icmp slt i32 %67, %2
br i1 %68, label %106, label %112
.us-lcssa.us.us: ; preds = %112
%.lcssa = phi float [ %114, %112 ]
%69 = load float*, float** %40, align 8
%70 = sext i32 %41 to i64
%71 = getelementptr inbounds float, float* %69, i64 %70
%72 = bitcast float %.lcssa to i32
%73 = bitcast float* %71 to i32*
%74 = load i32, i32* %73, align 4
br label %46
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
br label %._crit_edge
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29
ret void
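; .lr.ph.split is the path where the column count %3 is not known positive.
; The 16-iteration row loop was fully unrolled and its guarded loads folded
; away, so the fmax chain below merely re-reduces the reducer's initial value
; %.idx45.val before committing it through the same cmpxchg-based atomic max.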
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ]
%.idx45.val = load float, float* %.idx45, align 4
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8
%91 = srem i32 %.048, %3
%92 = load float*, float** %40, align 8
%93 = sext i32 %91 to i64
%94 = getelementptr inbounds float, float* %92, i64 %93
%95 = bitcast float %90 to i32
%96 = bitcast float* %94 to i32*
%97 = load i32, i32* %96, align 4
br label %98
; <label>:98: ; preds = %101, %.lr.ph.split
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ]
%99 = bitcast i32 %.011.i to float
%100 = fcmp olt float %99, %90
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
; <label>:101: ; preds = %98
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst
%103 = extractvalue { i32, i1 } %102, 0
%not..i = icmp eq i32 %.011.i, %103
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101
%104 = add nuw nsw i32 %.048, 32768
%105 = icmp slt i32 %104, %32
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60
; <label>:106: ; preds = %63
%107 = mul nsw i32 %67, %3
%108 = add nsw i32 %107, %41
%109 = sext i32 %108 to i64
%110 = getelementptr inbounds float, float* %45, i64 %109
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8
br label %112
; <label>:112: ; preds = %106, %63
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ]
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8
%115 = add nsw i32 %.04347.us.us, 2
%exitcond.1 = icmp eq i32 %115, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54
}
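; EigenMetaKernel assigning a TensorForcedEvalOp of the sum reduction into a
; 1-D TensorMap (demangled from the symbol below): the reduction has already
; been forced into a buffer, so the body is a plain grid-stride loop copying
; one 32-bit word per element from the evaluated buffer to the destination.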
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.15"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.021.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %0, i64 0, i32 0, i32 0
%.sroa.021.0.copyload = load float*, float** %.sroa.021.0..sroa_idx, align 8
%.sroa.5.0..sroa_idx25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %0, i64 0, i32 1, i32 3
%.sroa.5.0.copyload = load float*, float** %.sroa.5.0..sroa_idx25, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i.preheader, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
.lr.ph.i.preheader: ; preds = %2
br label %.lr.ph.i
.lr.ph.i: ; preds = %.lr.ph.i.preheader, %.lr.ph.i
%.07.i = phi i32 [ %17, %.lr.ph.i ], [ %7, %.lr.ph.i.preheader ]
%11 = sext i32 %.07.i to i64
%12 = getelementptr inbounds float, float* %.sroa.5.0.copyload, i64 %11
%13 = bitcast float* %12 to i32*
%14 = load i32, i32* %13, align 4
%15 = getelementptr inbounds float, float* %.sroa.021.0.copyload, i64 %11
%16 = bitcast float* %15 to i32*
store i32 %14, i32* %16, align 4
%17 = add nsw i32 %.07.i, %9
%18 = icmp slt i32 %17, %1
br i1 %18, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit: ; preds = %.lr.ph.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit, %2
ret void
}
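; EigenMetaKernel assigning a TensorReductionOp<SumReducer> over one dimension
; of a 2-D float tensor into a 1-D TensorMap: each output element sums one
; line of the input. The reduced extent is read from the evaluator struct
; (%.sroa.648.0.copyload); the inner sum is unrolled 4x with a scalar
; epilogue, and a separate path zero-fills the output when that extent is <= 0.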
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.24"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%7 = add nuw nsw i32 %5, %6
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%9 = mul nuw nsw i32 %8, %4
%.sroa.041.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 0, i32 0
%.sroa.041.0.copyload = load float*, float** %.sroa.041.0..sroa_idx, align 8
%.sroa.545.0..sroa_idx46 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 7
%.sroa.545.0..sroa_cast = bitcast i32* %.sroa.545.0..sroa_idx46 to i64*
%.sroa.545.0.copyload = load i64, i64* %.sroa.545.0..sroa_cast, align 8
%.sroa.648.0..sroa_idx49 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 9, i32 0, i64 0
%.sroa.648.0.copyload = load i32, i32* %.sroa.648.0..sroa_idx49, align 8
%.sroa.8.0..sroa_idx53 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 10, i32 0
%.sroa.8.0.copyload = load float*, float** %.sroa.8.0..sroa_idx53, align 8
%10 = icmp slt i32 %7, %1
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
.lr.ph.i: ; preds = %2
%11 = trunc i64 %.sroa.545.0.copyload to i32
%12 = icmp sgt i32 %.sroa.648.0.copyload, 0
%13 = lshr i64 %.sroa.545.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i
br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i
%15 = add i32 %.sroa.648.0.copyload, -1
%xtraiter = and i32 %.sroa.648.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.648.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i
br label %18
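; 4x-unrolled reduction body: steps k, k|1, k|2, k|3 each load
; input[step * stride + base] through ldg (read-only data cache) and
; accumulate with fadd; %niter counts the remaining unrolled trips.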
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa67 = phi i32 [ %47, %18 ]
%.lcssa66 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; <label>:48: ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !72
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa68 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa68, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit, %2
ret void
}
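; InitVector kernel (demangled from the symbol below): a grid-stride loop that
; stores the scalar argument %0 into every element of a 1-D float tensor,
; presumably seeding the reduction output before the atomic kernels run.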
; Function Attrs: nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%8 = add nuw nsw i32 %6, %7
%9 = icmp slt i32 %8, %1
br i1 %9, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %3
%10 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %2, i64 0, i32 0
%11 = load float*, float** %10, align 8
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%13 = mul nuw nsw i32 %12, %5
br label %14
._crit_edge.loopexit: ; preds = %14
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %3
ret void
; <label>:14: ; preds = %.lr.ph, %14
%.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ]
%15 = sext i32 %.08 to i64
%16 = getelementptr inbounds float, float* %11, i64 %15
store float %0, float* %16, align 4
%17 = add i32 %13, %.08
%18 = icmp slt i32 %17, %1
br i1 %18, label %14, label %._crit_edge.loopexit
}
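; RowReduceKernel<32, 256, 128> with CudaSumReducer (demangled from the symbol
; below): each row is processed in 32768-element chunks (128 elements per
; thread x 256 threads). A thread accumulates its strided slice with ldg +
; fadd, the 32 lanes of each warp are then folded with shfl.down, and lane 0
; adds the warp total into the output element with a single atomicAdd.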
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0
%38 = load float*, float** %37, align 8
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
br label %41
._crit_edge.loopexit: ; preds = %187
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:41: ; preds = %.lr.ph, %187
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ]
%42 = srem i32 %.0114, %31
%43 = sdiv i32 %.0114, %31
%44 = shl nsw i32 %42, 15
%45 = or i32 %44, %34
%46 = icmp slt i32 %43, %2
br i1 %46, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %164, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %41
%47 = mul nsw i32 %43, %3
%48 = add i32 %47, %45
br label %49
; <label>:49: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ]
%50 = add nuw nsw i32 %.098108, 16
%51 = shl i32 %.098108, 8
%52 = or i32 %51, 3840
%53 = add nsw i32 %52, %45
%54 = icmp slt i32 %53, %3
br i1 %54, label %.preheader.preheader, label %.preheader101
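; Fast path: the whole 4096-element sub-chunk (16 steps x 256 threads) is in
; range, so the inner loop is fully unrolled; thread t loads columns base+t,
; base+t+256, ..., base+t+3840 through ldg and folds them into the running sum.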
.preheader.preheader: ; preds = %49
%55 = add i32 %48, %51
%56 = sext i32 %55 to i64
%57 = getelementptr inbounds float, float* %40, i64 %56
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8
%59 = fadd float %.095109, %58
%60 = shl i32 %.098108, 8
%61 = or i32 %60, 256
%62 = add i32 %48, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, float* %40, i64 %63
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8
%66 = fadd float %59, %65
%67 = shl i32 %.098108, 8
%68 = or i32 %67, 512
%69 = add i32 %48, %68
%70 = sext i32 %69 to i64
%71 = getelementptr inbounds float, float* %40, i64 %70
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8
%73 = fadd float %66, %72
%74 = shl i32 %.098108, 8
%75 = or i32 %74, 768
%76 = add i32 %48, %75
%77 = sext i32 %76 to i64
%78 = getelementptr inbounds float, float* %40, i64 %77
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8
%80 = fadd float %73, %79
%81 = shl i32 %.098108, 8
%82 = or i32 %81, 1024
%83 = add i32 %48, %82
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %40, i64 %84
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8
%87 = fadd float %80, %86
%88 = shl i32 %.098108, 8
%89 = or i32 %88, 1280
%90 = add i32 %48, %89
%91 = sext i32 %90 to i64
%92 = getelementptr inbounds float, float* %40, i64 %91
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8
%94 = fadd float %87, %93
%95 = shl i32 %.098108, 8
%96 = or i32 %95, 1536
%97 = add i32 %48, %96
%98 = sext i32 %97 to i64
%99 = getelementptr inbounds float, float* %40, i64 %98
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8
%101 = fadd float %94, %100
%102 = shl i32 %.098108, 8
%103 = or i32 %102, 1792
%104 = add i32 %48, %103
%105 = sext i32 %104 to i64
%106 = getelementptr inbounds float, float* %40, i64 %105
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8
%108 = fadd float %101, %107
%109 = shl i32 %.098108, 8
%110 = or i32 %109, 2048
%111 = add i32 %48, %110
%112 = sext i32 %111 to i64
%113 = getelementptr inbounds float, float* %40, i64 %112
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8
%115 = fadd float %108, %114
%116 = shl i32 %.098108, 8
%117 = or i32 %116, 2304
%118 = add i32 %48, %117
%119 = sext i32 %118 to i64
%120 = getelementptr inbounds float, float* %40, i64 %119
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8
%122 = fadd float %115, %121
%123 = shl i32 %.098108, 8
%124 = or i32 %123, 2560
%125 = add i32 %48, %124
%126 = sext i32 %125 to i64
%127 = getelementptr inbounds float, float* %40, i64 %126
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8
%129 = fadd float %122, %128
%130 = shl i32 %.098108, 8
%131 = or i32 %130, 2816
%132 = add i32 %48, %131
%133 = sext i32 %132 to i64
%134 = getelementptr inbounds float, float* %40, i64 %133
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8
%136 = fadd float %129, %135
%137 = shl i32 %.098108, 8
%138 = or i32 %137, 3072
%139 = add i32 %48, %138
%140 = sext i32 %139 to i64
%141 = getelementptr inbounds float, float* %40, i64 %140
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8
%143 = fadd float %136, %142
%144 = shl i32 %.098108, 8
%145 = or i32 %144, 3328
%146 = add i32 %48, %145
%147 = sext i32 %146 to i64
%148 = getelementptr inbounds float, float* %40, i64 %147
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8
%150 = fadd float %143, %149
%151 = shl i32 %.098108, 8
%152 = or i32 %151, 3584
%153 = add i32 %48, %152
%154 = sext i32 %153 to i64
%155 = getelementptr inbounds float, float* %40, i64 %154
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8
%157 = fadd float %150, %156
%158 = shl i32 %.098108, 8
%159 = or i32 %158, 3840
%160 = add i32 %48, %159
%161 = sext i32 %160 to i64
%162 = getelementptr inbounds float, float* %40, i64 %161
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
%164 = fadd float %157, %163
%165 = icmp slt i32 %50, 128
br i1 %165, label %49, label %.thread.preheader.loopexit
.preheader101: ; preds = %49
%.lcssa = phi i32 [ %51, %49 ]
%.098108.lcssa = phi i32 [ %.098108, %49 ]
%.095109.lcssa = phi float [ %.095109, %49 ]
%166 = add nsw i32 %.lcssa, %45
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %47
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %40, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = fadd float %.095109.lcssa, %172
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %45
%177 = icmp slt i32 %176, %3
br i1 %177, label %190, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %46, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %187, label %183
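; .thread is the warp-level tree reduction: inline-asm shfl.down.b32 with
; offsets 16, 8, 4, 2, 1 folds the 32 lane partials together. Block 178 then
; routes only lane 0 of warps holding a valid row (%brmerge false) to block
; 183, which issues the one atomicAdd per warp into the output row.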
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = fadd float %.8112, %179
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !73
; <label>:183: ; preds = %178
%184 = sext i32 %43 to i64
%185 = getelementptr inbounds float, float* %38, i64 %184
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8
br label %187
; <label>:187: ; preds = %178, %183
%188 = add nuw nsw i32 %.0114, 32
%189 = icmp slt i32 %188, %32
br i1 %189, label %41, label %._crit_edge.loopexit
; <label>:190: ; preds = %168
%191 = add nsw i32 %176, %47
%192 = sext i32 %191 to i64
%193 = getelementptr inbounds float, float* %40, i64 %192
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8
%195 = fadd float %173, %194
%196 = shl i32 %.098108.lcssa, 8
%197 = or i32 %196, 512
%198 = add nsw i32 %197, %45
%199 = icmp slt i32 %198, %3
br i1 %199, label %200, label %.thread.preheader
; <label>:200: ; preds = %190
%201 = add nsw i32 %198, %47
%202 = sext i32 %201 to i64
%203 = getelementptr inbounds float, float* %40, i64 %202
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8
%205 = fadd float %195, %204
%206 = shl i32 %.098108.lcssa, 8
%207 = or i32 %206, 768
%208 = add nsw i32 %207, %45
%209 = icmp slt i32 %208, %3
br i1 %209, label %210, label %.thread.preheader
; <label>:210: ; preds = %200
%211 = add nsw i32 %208, %47
%212 = sext i32 %211 to i64
%213 = getelementptr inbounds float, float* %40, i64 %212
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8
%215 = fadd float %205, %214
%216 = shl i32 %.098108.lcssa, 8
%217 = or i32 %216, 1024
%218 = add nsw i32 %217, %45
%219 = icmp slt i32 %218, %3
br i1 %219, label %220, label %.thread.preheader
; <label>:220: ; preds = %210
%221 = add nsw i32 %218, %47
%222 = sext i32 %221 to i64
%223 = getelementptr inbounds float, float* %40, i64 %222
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8
%225 = fadd float %215, %224
%226 = shl i32 %.098108.lcssa, 8
%227 = or i32 %226, 1280
%228 = add nsw i32 %227, %45
%229 = icmp slt i32 %228, %3
br i1 %229, label %230, label %.thread.preheader
; <label>:230: ; preds = %220
%231 = add nsw i32 %228, %47
%232 = sext i32 %231 to i64
%233 = getelementptr inbounds float, float* %40, i64 %232
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8
%235 = fadd float %225, %234
%236 = shl i32 %.098108.lcssa, 8
%237 = or i32 %236, 1536
%238 = add nsw i32 %237, %45
%239 = icmp slt i32 %238, %3
br i1 %239, label %240, label %.thread.preheader
; <label>:240: ; preds = %230
%241 = add nsw i32 %238, %47
%242 = sext i32 %241 to i64
%243 = getelementptr inbounds float, float* %40, i64 %242
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8
%245 = fadd float %235, %244
%246 = shl i32 %.098108.lcssa, 8
%247 = or i32 %246, 1792
%248 = add nsw i32 %247, %45
%249 = icmp slt i32 %248, %3
br i1 %249, label %250, label %.thread.preheader
; <label>:250: ; preds = %240
%251 = add nsw i32 %248, %47
%252 = sext i32 %251 to i64
%253 = getelementptr inbounds float, float* %40, i64 %252
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8
%255 = fadd float %245, %254
%256 = shl i32 %.098108.lcssa, 8
%257 = or i32 %256, 2048
%258 = add nsw i32 %257, %45
%259 = icmp slt i32 %258, %3
br i1 %259, label %260, label %.thread.preheader
; <label>:260: ; preds = %250
%261 = add nsw i32 %258, %47
%262 = sext i32 %261 to i64
%263 = getelementptr inbounds float, float* %40, i64 %262
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8
%265 = fadd float %255, %264
%266 = shl i32 %.098108.lcssa, 8
%267 = or i32 %266, 2304
%268 = add nsw i32 %267, %45
%269 = icmp slt i32 %268, %3
br i1 %269, label %270, label %.thread.preheader
; <label>:270: ; preds = %260
%271 = add nsw i32 %268, %47
%272 = sext i32 %271 to i64
%273 = getelementptr inbounds float, float* %40, i64 %272
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8
%275 = fadd float %265, %274
%276 = shl i32 %.098108.lcssa, 8
%277 = or i32 %276, 2560
%278 = add nsw i32 %277, %45
%279 = icmp slt i32 %278, %3
br i1 %279, label %280, label %.thread.preheader
; <label>:280: ; preds = %270
%281 = add nsw i32 %278, %47
%282 = sext i32 %281 to i64
%283 = getelementptr inbounds float, float* %40, i64 %282
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8
%285 = fadd float %275, %284
%286 = shl i32 %.098108.lcssa, 8
%287 = or i32 %286, 2816
%288 = add nsw i32 %287, %45
%289 = icmp slt i32 %288, %3
br i1 %289, label %290, label %.thread.preheader
; <label>:290: ; preds = %280
%291 = add nsw i32 %288, %47
%292 = sext i32 %291 to i64
%293 = getelementptr inbounds float, float* %40, i64 %292
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8
%295 = fadd float %285, %294
%296 = shl i32 %.098108.lcssa, 8
%297 = or i32 %296, 3072
%298 = add nsw i32 %297, %45
%299 = icmp slt i32 %298, %3
br i1 %299, label %300, label %.thread.preheader
; <label>:300: ; preds = %290
%301 = add nsw i32 %298, %47
%302 = sext i32 %301 to i64
%303 = getelementptr inbounds float, float* %40, i64 %302
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8
%305 = fadd float %295, %304
%306 = shl i32 %.098108.lcssa, 8
%307 = or i32 %306, 3328
%308 = add nsw i32 %307, %45
%309 = icmp slt i32 %308, %3
br i1 %309, label %310, label %.thread.preheader
; <label>:310: ; preds = %300
%311 = add nsw i32 %308, %47
%312 = sext i32 %311 to i64
%313 = getelementptr inbounds float, float* %40, i64 %312
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8
%315 = fadd float %305, %314
%316 = shl i32 %.098108.lcssa, 8
%317 = or i32 %316, 3584
%318 = add nsw i32 %317, %45
%319 = icmp slt i32 %318, %3
br i1 %319, label %320, label %.thread.preheader
; <label>:320: ; preds = %310
%321 = add nsw i32 %318, %47
%322 = sext i32 %321 to i64
%323 = getelementptr inbounds float, float* %40, i64 %322
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8
%325 = fadd float %315, %324
br label %.thread.preheader
}
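; RowReduceKernel<32, 256, 128> with CudaMaxReducer: the same row-chunking and
; warp-shuffle structure as the sum variant above, except the accumulator is
; seeded from the reducer's stored initial value (%.idx.val), elements are
; combined with llvm.nvvm.fmax.f, and the per-warp result is committed through
; the inlined CudaMaxReducer::atomic_reduce cmpxchg loop.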
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 32
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %3, 32767
%31 = sdiv i32 %30, 32768
%32 = mul nsw i32 %31, %2
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = icmp slt i32 %33, %32
br i1 %35, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%36 = and i32 %34, 31
%.not = icmp ne i32 %36, 0
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
br label %39
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit, %29
ret void
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ]
%40 = srem i32 %.0114, %31
%41 = sdiv i32 %.0114, %31
%42 = shl nsw i32 %40, 15
%43 = or i32 %42, %34
%.idx.val = load float, float* %.idx, align 4
%44 = icmp slt i32 %41, %2
br i1 %44, label %.preheader102, label %.thread.preheader
.thread.preheader.loopexit: ; preds = %.preheader.preheader
%.lcssa137 = phi float [ %163, %.preheader.preheader ]
br label %.thread.preheader
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ]
br label %.thread
.preheader102: ; preds = %39
%45 = mul nsw i32 %41, %3
%46 = add i32 %45, %43
%47 = load float*, float** %38, align 8
br label %48
; <label>:48: ; preds = %.preheader102, %.preheader.preheader
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ]
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ]
%49 = add nuw nsw i32 %.098108, 16
%50 = shl i32 %.098108, 8
%51 = or i32 %50, 3840
%52 = add nsw i32 %51, %43
%53 = icmp slt i32 %52, %3
br i1 %53, label %.preheader.preheader, label %.preheader101
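; Fast path, max variant: as in the sum kernel above, a fully in-range
; 4096-element sub-chunk is handled by 16 unrolled ldg + fmax steps.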
.preheader.preheader: ; preds = %48
%54 = add i32 %46, %50
%55 = sext i32 %54 to i64
%56 = getelementptr inbounds float, float* %47, i64 %55
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8
%59 = shl i32 %.098108, 8
%60 = or i32 %59, 256
%61 = add i32 %46, %60
%62 = sext i32 %61 to i64
%63 = getelementptr inbounds float, float* %47, i64 %62
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8
%66 = shl i32 %.098108, 8
%67 = or i32 %66, 512
%68 = add i32 %46, %67
%69 = sext i32 %68 to i64
%70 = getelementptr inbounds float, float* %47, i64 %69
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8
%73 = shl i32 %.098108, 8
%74 = or i32 %73, 768
%75 = add i32 %46, %74
%76 = sext i32 %75 to i64
%77 = getelementptr inbounds float, float* %47, i64 %76
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8
%80 = shl i32 %.098108, 8
%81 = or i32 %80, 1024
%82 = add i32 %46, %81
%83 = sext i32 %82 to i64
%84 = getelementptr inbounds float, float* %47, i64 %83
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8
%87 = shl i32 %.098108, 8
%88 = or i32 %87, 1280
%89 = add i32 %46, %88
%90 = sext i32 %89 to i64
%91 = getelementptr inbounds float, float* %47, i64 %90
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8
%94 = shl i32 %.098108, 8
%95 = or i32 %94, 1536
%96 = add i32 %46, %95
%97 = sext i32 %96 to i64
%98 = getelementptr inbounds float, float* %47, i64 %97
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8
%101 = shl i32 %.098108, 8
%102 = or i32 %101, 1792
%103 = add i32 %46, %102
%104 = sext i32 %103 to i64
%105 = getelementptr inbounds float, float* %47, i64 %104
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8
%108 = shl i32 %.098108, 8
%109 = or i32 %108, 2048
%110 = add i32 %46, %109
%111 = sext i32 %110 to i64
%112 = getelementptr inbounds float, float* %47, i64 %111
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8
%115 = shl i32 %.098108, 8
%116 = or i32 %115, 2304
%117 = add i32 %46, %116
%118 = sext i32 %117 to i64
%119 = getelementptr inbounds float, float* %47, i64 %118
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8
%122 = shl i32 %.098108, 8
%123 = or i32 %122, 2560
%124 = add i32 %46, %123
%125 = sext i32 %124 to i64
%126 = getelementptr inbounds float, float* %47, i64 %125
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8
%129 = shl i32 %.098108, 8
%130 = or i32 %129, 2816
%131 = add i32 %46, %130
%132 = sext i32 %131 to i64
%133 = getelementptr inbounds float, float* %47, i64 %132
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8
%136 = shl i32 %.098108, 8
%137 = or i32 %136, 3072
%138 = add i32 %46, %137
%139 = sext i32 %138 to i64
%140 = getelementptr inbounds float, float* %47, i64 %139
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8
%143 = shl i32 %.098108, 8
%144 = or i32 %143, 3328
%145 = add i32 %46, %144
%146 = sext i32 %145 to i64
%147 = getelementptr inbounds float, float* %47, i64 %146
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8
%150 = shl i32 %.098108, 8
%151 = or i32 %150, 3584
%152 = add i32 %46, %151
%153 = sext i32 %152 to i64
%154 = getelementptr inbounds float, float* %47, i64 %153
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8
%157 = shl i32 %.098108, 8
%158 = or i32 %157, 3840
%159 = add i32 %46, %158
%160 = sext i32 %159 to i64
%161 = getelementptr inbounds float, float* %47, i64 %160
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8
%164 = icmp slt i32 %49, 128
br i1 %164, label %48, label %.thread.preheader.loopexit
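; Remainder path: when the 16-wide guard fails, fall through here and redo
; the same 16 strided loads one at a time (blocks %168, %198, %208, ...),
; each with its own bounds check against %3, branching to %.thread.preheader
; as soon as a load would go out of range.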
.preheader101: ; preds = %48
%.lcssa = phi i32 [ %50, %48 ]
%.098108.lcssa = phi i32 [ %.098108, %48 ]
%.095109.lcssa = phi float [ %.095109, %48 ]
%165 = load float*, float** %38, align 8
%166 = add nsw i32 %.lcssa, %43
%167 = icmp slt i32 %166, %3
br i1 %167, label %168, label %.thread.preheader
; <label>:168: ; preds = %.preheader101
%169 = add nsw i32 %166, %45
%170 = sext i32 %169 to i64
%171 = getelementptr inbounds float, float* %165, i64 %170
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8
%174 = shl i32 %.098108.lcssa, 8
%175 = or i32 %174, 256
%176 = add nsw i32 %175, %43
%177 = icmp slt i32 %176, %3
br i1 %177, label %198, label %.thread.preheader
; <label>:178: ; preds = %.thread
%.lcssa138 = phi float [ %180, %.thread ]
%.not99 = xor i1 %44, true
%brmerge = or i1 %.not, %.not99
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183
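; Warp-level tree reduction: combine the 32 per-lane partial maxima with
; shfl.down at offsets 16, 8, 4, 2, 1. A hedged CUDA sketch (pre-CUDA-9
; __shfl_down, matching this 2016-era module; names illustrative):
;   for (int offset = 16; offset > 0; offset >>= 1)
;     val = fmaxf(val, __shfl_down(val, offset, 32));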
.thread: ; preds = %.thread.preheader, %.thread
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ]
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ]
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8
%181 = lshr i32 %.092113, 1
%182 = icmp eq i32 %181, 0
br i1 %182, label %178, label %.thread, !llvm.loop !74
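; Publish via compare-and-swap: a float atomic-max built on i32 cmpxchg,
; since sm_35 has no native float atomic max. A hedged CUDA equivalent of
; blocks %183/%190/%193 (illustrative names, not Eigen's exact source):
;   __device__ void atomic_max(float* out, float v) {
;     unsigned* addr = reinterpret_cast<unsigned*>(out);
;     unsigned old = *addr;
;     while (__uint_as_float(old) < v) {
;       unsigned assumed = old;
;       old = atomicCAS(addr, assumed, __float_as_uint(v));
;       if (old == assumed) break;   // CAS succeeded, value published
;     }
;   }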
; <label>:183: ; preds = %178
%184 = sext i32 %41 to i64
%185 = load float*, float** %37, align 8
%186 = getelementptr inbounds float, float* %185, i64 %184
%187 = bitcast float %.lcssa138 to i32
%188 = bitcast float* %186 to i32*
%189 = load i32, i32* %188, align 4
br label %190
; <label>:190: ; preds = %193, %183
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ]
%191 = bitcast i32 %.011.i to float
%192 = fcmp olt float %191, %.lcssa138
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit
; <label>:193: ; preds = %190
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst
%195 = extractvalue { i32, i1 } %194, 0
%not..i = icmp eq i32 %.011.i, %195
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178
%196 = add nuw nsw i32 %.0114, 32
%197 = icmp slt i32 %196, %32
br i1 %197, label %39, label %._crit_edge.loopexit
; <label>:198: ; preds = %168
%199 = add nsw i32 %176, %45
%200 = sext i32 %199 to i64
%201 = getelementptr inbounds float, float* %165, i64 %200
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8
%204 = shl i32 %.098108.lcssa, 8
%205 = or i32 %204, 512
%206 = add nsw i32 %205, %43
%207 = icmp slt i32 %206, %3
br i1 %207, label %208, label %.thread.preheader
; <label>:208: ; preds = %198
%209 = add nsw i32 %206, %45
%210 = sext i32 %209 to i64
%211 = getelementptr inbounds float, float* %165, i64 %210
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8
%214 = shl i32 %.098108.lcssa, 8
%215 = or i32 %214, 768
%216 = add nsw i32 %215, %43
%217 = icmp slt i32 %216, %3
br i1 %217, label %218, label %.thread.preheader
; <label>:218: ; preds = %208
%219 = add nsw i32 %216, %45
%220 = sext i32 %219 to i64
%221 = getelementptr inbounds float, float* %165, i64 %220
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8
%224 = shl i32 %.098108.lcssa, 8
%225 = or i32 %224, 1024
%226 = add nsw i32 %225, %43
%227 = icmp slt i32 %226, %3
br i1 %227, label %228, label %.thread.preheader
; <label>:228: ; preds = %218
%229 = add nsw i32 %226, %45
%230 = sext i32 %229 to i64
%231 = getelementptr inbounds float, float* %165, i64 %230
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8
%234 = shl i32 %.098108.lcssa, 8
%235 = or i32 %234, 1280
%236 = add nsw i32 %235, %43
%237 = icmp slt i32 %236, %3
br i1 %237, label %238, label %.thread.preheader
; <label>:238: ; preds = %228
%239 = add nsw i32 %236, %45
%240 = sext i32 %239 to i64
%241 = getelementptr inbounds float, float* %165, i64 %240
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8
%244 = shl i32 %.098108.lcssa, 8
%245 = or i32 %244, 1536
%246 = add nsw i32 %245, %43
%247 = icmp slt i32 %246, %3
br i1 %247, label %248, label %.thread.preheader
; <label>:248: ; preds = %238
%249 = add nsw i32 %246, %45
%250 = sext i32 %249 to i64
%251 = getelementptr inbounds float, float* %165, i64 %250
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8
%254 = shl i32 %.098108.lcssa, 8
%255 = or i32 %254, 1792
%256 = add nsw i32 %255, %43
%257 = icmp slt i32 %256, %3
br i1 %257, label %258, label %.thread.preheader
; <label>:258: ; preds = %248
%259 = add nsw i32 %256, %45
%260 = sext i32 %259 to i64
%261 = getelementptr inbounds float, float* %165, i64 %260
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8
%264 = shl i32 %.098108.lcssa, 8
%265 = or i32 %264, 2048
%266 = add nsw i32 %265, %43
%267 = icmp slt i32 %266, %3
br i1 %267, label %268, label %.thread.preheader
; <label>:268: ; preds = %258
%269 = add nsw i32 %266, %45
%270 = sext i32 %269 to i64
%271 = getelementptr inbounds float, float* %165, i64 %270
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8
%274 = shl i32 %.098108.lcssa, 8
%275 = or i32 %274, 2304
%276 = add nsw i32 %275, %43
%277 = icmp slt i32 %276, %3
br i1 %277, label %278, label %.thread.preheader
; <label>:278: ; preds = %268
%279 = add nsw i32 %276, %45
%280 = sext i32 %279 to i64
%281 = getelementptr inbounds float, float* %165, i64 %280
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8
%284 = shl i32 %.098108.lcssa, 8
%285 = or i32 %284, 2560
%286 = add nsw i32 %285, %43
%287 = icmp slt i32 %286, %3
br i1 %287, label %288, label %.thread.preheader
; <label>:288: ; preds = %278
%289 = add nsw i32 %286, %45
%290 = sext i32 %289 to i64
%291 = getelementptr inbounds float, float* %165, i64 %290
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8
%294 = shl i32 %.098108.lcssa, 8
%295 = or i32 %294, 2816
%296 = add nsw i32 %295, %43
%297 = icmp slt i32 %296, %3
br i1 %297, label %298, label %.thread.preheader
; <label>:298: ; preds = %288
%299 = add nsw i32 %296, %45
%300 = sext i32 %299 to i64
%301 = getelementptr inbounds float, float* %165, i64 %300
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8
%304 = shl i32 %.098108.lcssa, 8
%305 = or i32 %304, 3072
%306 = add nsw i32 %305, %43
%307 = icmp slt i32 %306, %3
br i1 %307, label %308, label %.thread.preheader
; <label>:308: ; preds = %298
%309 = add nsw i32 %306, %45
%310 = sext i32 %309 to i64
%311 = getelementptr inbounds float, float* %165, i64 %310
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8
%314 = shl i32 %.098108.lcssa, 8
%315 = or i32 %314, 3328
%316 = add nsw i32 %315, %43
%317 = icmp slt i32 %316, %3
br i1 %317, label %318, label %.thread.preheader
; <label>:318: ; preds = %308
%319 = add nsw i32 %316, %45
%320 = sext i32 %319 to i64
%321 = getelementptr inbounds float, float* %165, i64 %320
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8
%324 = shl i32 %.098108.lcssa, 8
%325 = or i32 %324, 3584
%326 = add nsw i32 %325, %43
%327 = icmp slt i32 %326, %3
br i1 %327, label %328, label %.thread.preheader
; <label>:328: ; preds = %318
%329 = add nsw i32 %326, %45
%330 = sext i32 %329 to i64
%331 = getelementptr inbounds float, float* %165, i64 %330
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8
br label %.thread.preheader
}
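; ColumnReduceKernel<128 blocks, 256 threads, 16 rows per thread> with
; CudaSumReducer: after asserting the expected launch configuration, each
; work item owns one (column, 16-row block) pair, accumulates a partial sum
; with __ldg, and folds it into its output column with a float atomicAdd
; (llvm.nvvm.atomic.load.add.f32). Hedged CUDA sketch of the main path
; (identifiers illustrative, not Eigen's exact source):
;   int blocks = (num_rows + 15) / 16;
;   for (int idx = blockIdx.x * 256 + threadIdx.x;
;        idx < blocks * num_cols; idx += 128 * 256) {
;     int col = idx % num_cols;
;     int rb  = (idx / num_cols) % blocks;
;     float sum = 0.0f;
;     for (int k = 0; k < 16; ++k) {        // unrolled by 2 in the IR below
;       int row = rb * 16 + k;
;       if (row < num_rows) sum += __ldg(in + row * num_cols + col);
;     }
;     atomicAdd(out + col, sum);
;   }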
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = load float*, float** %39, align 8
%41 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0
%42 = load float*, float** %41, align 8
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
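; %38 unswitches on the sign of the column count %3: the .us blocks are the
; real path (%3 >= 0); the .split blocks are the clone for a negative count,
; in which the optimizer folded every partial sum to 0.0 so that only
; atomicAdd(out + idx % num_cols, 0.0f) survives per iteration.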
.lr.ph.split.preheader: ; preds = %.lr.ph
%43 = add i32 %32, -1
%44 = sub i32 %43, %34
%45 = sub i32 %44, %35
%46 = lshr i32 %45, 15
%47 = add nuw nsw i32 %46, 1
%xtraiter = and i32 %47, 3
%48 = icmp ult i32 %45, 98304
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader
%unroll_iter = sub nsw i32 %47, %xtraiter
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ]
%49 = srem i32 %.047.us, %3
%50 = sdiv i32 %.047.us, %3
%51 = srem i32 %50, %31
%52 = shl nsw i32 %51, 4
br label %53
; <label>:53: ; preds = %104, %.lr.ph.split.us
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ]
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ]
%54 = add nuw nsw i32 %.04346.us.us, %52
%55 = icmp slt i32 %54, %2
br i1 %55, label %56, label %62
; <label>:56: ; preds = %53
%57 = mul nsw i32 %54, %3
%58 = add nsw i32 %57, %49
%59 = sext i32 %58 to i64
%60 = getelementptr inbounds float, float* %40, i64 %59
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8
br label %62
; <label>:62: ; preds = %56, %53
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ]
%64 = fadd float %.04445.us.us, %63
%65 = or i32 %.04346.us.us, 1
%66 = add nuw nsw i32 %65, %52
%67 = icmp slt i32 %66, %2
br i1 %67, label %98, label %104
.us-lcssa.us.us: ; preds = %104
%.lcssa = phi float [ %106, %104 ]
%68 = sext i32 %49 to i64
%69 = getelementptr inbounds float, float* %42, i64 %68
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8
%71 = add nuw nsw i32 %.047.us, 32768
%72 = icmp slt i32 %71, %32
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us
br label %._crit_edge
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ]
br label %._crit_edge.loopexit59.unr-lcssa
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ]
%lcmp.mod = icmp eq i32 %xtraiter, 0
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa
br label %.lr.ph.split.epil
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ]
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ]
%73 = srem i32 %.047.epil, %3
%74 = sext i32 %73 to i64
%75 = getelementptr inbounds float, float* %42, i64 %74
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8
%77 = add nuw nsw i32 %.047.epil, 32768
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !75
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil
br label %._crit_edge.loopexit59
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29
ret void
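; Degenerate .split loop, unrolled 4x: each step is an
; atomicAdd(out + idx % num_cols, 0.0f) at idx strides of 32768;
; %xtraiter/%unroll_iter and the .epil blocks above handle the up-to-3
; leftover iterations.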
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ]
%78 = srem i32 %.047, %3
%79 = sext i32 %78 to i64
%80 = getelementptr inbounds float, float* %42, i64 %79
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8
%82 = add nuw nsw i32 %.047, 32768
%83 = srem i32 %82, %3
%84 = sext i32 %83 to i64
%85 = getelementptr inbounds float, float* %42, i64 %84
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8
%87 = add nsw i32 %.047, 65536
%88 = srem i32 %87, %3
%89 = sext i32 %88 to i64
%90 = getelementptr inbounds float, float* %42, i64 %89
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8
%92 = add nsw i32 %.047, 98304
%93 = srem i32 %92, %3
%94 = sext i32 %93 to i64
%95 = getelementptr inbounds float, float* %42, i64 %94
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8
%97 = add nsw i32 %.047, 131072
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split
; <label>:98: ; preds = %62
%99 = mul nsw i32 %66, %3
%100 = add nsw i32 %99, %49
%101 = sext i32 %100 to i64
%102 = getelementptr inbounds float, float* %40, i64 %101
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8
br label %104
; <label>:104: ; preds = %98, %62
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ]
%106 = fadd float %64, %105
%107 = add nsw i32 %.04346.us.us, 2
%exitcond.1 = icmp eq i32 %107, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53
}
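; Same ColumnReduceKernel shape, now with CudaMaxReducer: partial results
; are seeded from the reducer's stored initial value (%.idx45), combined
; with fmax instead of fadd, and published with the cmpxchg-based atomic max
; shown earlier rather than an atomicAdd.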
; Function Attrs: convergent nounwind
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat {
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
%7 = icmp eq i32 %6, 256
br i1 %7, label %9, label %8
; <label>:8: ; preds = %5
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:9: ; preds = %5
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47
%11 = icmp eq i32 %10, 1
br i1 %11, label %13, label %12
; <label>:12: ; preds = %9
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:13: ; preds = %9
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57
%15 = icmp eq i32 %14, 1
br i1 %15, label %17, label %16
; <label>:16: ; preds = %13
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:17: ; preds = %13
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
%19 = icmp eq i32 %18, 128
br i1 %19, label %21, label %20
; <label>:20: ; preds = %17
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:21: ; preds = %17
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49
%23 = icmp eq i32 %22, 1
br i1 %23, label %25, label %24
; <label>:24: ; preds = %21
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:25: ; preds = %21
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49
%27 = icmp eq i32 %26, 1
br i1 %27, label %29, label %28
; <label>:28: ; preds = %25
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9
unreachable
; <label>:29: ; preds = %25
%30 = add nsw i32 %2, 15
%31 = sdiv i32 %30, 16
%32 = mul nsw i32 %31, %3
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
%35 = shl nuw nsw i32 %33, 8
%36 = add nuw nsw i32 %35, %34
%37 = icmp slt i32 %36, %32
br i1 %37, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %29
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0
%38 = icmp sgt i32 %3, -1
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0
%40 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader
.lr.ph.split.preheader: ; preds = %.lr.ph
br label %.lr.ph.split
.lr.ph.split.us.preheader: ; preds = %.lr.ph
br label %.lr.ph.split.us
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ]
%41 = srem i32 %.048.us, %3
%42 = sdiv i32 %.048.us, %3
%43 = srem i32 %42, %31
%44 = shl nsw i32 %43, 4
%.idx45.val.us = load float, float* %.idx45, align 4
%45 = load float*, float** %39, align 8
br label %54
; <label>:46: ; preds = %49, %.us-lcssa.us.us
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ]
%47 = bitcast i32 %.011.i.us to float
%48 = fcmp olt float %47, %.lcssa
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
; <label>:49: ; preds = %46
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst
%51 = extractvalue { i32, i1 } %50, 0
%not..i.us = icmp eq i32 %.011.i.us, %51
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46
%52 = add nuw nsw i32 %.048.us, 32768
%53 = icmp slt i32 %52, %32
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit
; <label>:54: ; preds = %112, %.lr.ph.split.us
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ]
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ]
%55 = add nuw nsw i32 %.04347.us.us, %44
%56 = icmp slt i32 %55, %2
br i1 %56, label %57, label %63
; <label>:57: ; preds = %54
%58 = mul nsw i32 %55, %3
%59 = add nsw i32 %58, %41
%60 = sext i32 %59 to i64
%61 = getelementptr inbounds float, float* %45, i64 %60
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8
br label %63
; <label>:63: ; preds = %54, %57
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ]
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8
%66 = or i32 %.04347.us.us, 1
%67 = add nuw nsw i32 %66, %44
%68 = icmp slt i32 %67, %2
br i1 %68, label %106, label %112
.us-lcssa.us.us: ; preds = %112
%.lcssa = phi float [ %114, %112 ]
%69 = sext i32 %41 to i64
%70 = load float*, float** %40, align 8
%71 = getelementptr inbounds float, float* %70, i64 %69
%72 = bitcast float %.lcssa to i32
%73 = bitcast float* %71 to i32*
%74 = load i32, i32* %73, align 4
br label %46
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us
br label %._crit_edge
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
br label %._crit_edge
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29
ret void
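; Degenerate .split path of the max kernel: the optimizer replaced every
; load with the reducer's initial value, so the body collapses to 16 chained
; fmax calls of that value with itself, and the CAS loop below then tries to
; publish it.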
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ]
%.idx45.val = load float, float* %.idx45, align 4
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8
%91 = srem i32 %.048, %3
%92 = sext i32 %91 to i64
%93 = load float*, float** %40, align 8
%94 = getelementptr inbounds float, float* %93, i64 %92
%95 = bitcast float %90 to i32
%96 = bitcast float* %94 to i32*
%97 = load i32, i32* %96, align 4
br label %98
; <label>:98: ; preds = %101, %.lr.ph.split
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ]
%99 = bitcast i32 %.011.i to float
%100 = fcmp olt float %99, %90
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit
; <label>:101: ; preds = %98
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst
%103 = extractvalue { i32, i1 } %102, 0
%not..i = icmp eq i32 %.011.i, %103
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101
%104 = add nuw nsw i32 %.048, 32768
%105 = icmp slt i32 %104, %32
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60
; <label>:106: ; preds = %63
%107 = mul nsw i32 %67, %3
%108 = add nsw i32 %107, %41
%109 = sext i32 %108 to i64
%110 = getelementptr inbounds float, float* %45, i64 %109
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8
br label %112
; <label>:112: ; preds = %106, %63
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ]
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8
%115 = add nsw i32 %.04347.us.us, 2
%exitcond.1 = icmp eq i32 %115, 16
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54
}
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.fmax.f(float, float) #1
attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { convergent nounwind }
attributes #4 = { argmemonly nounwind readonly }
attributes #5 = { argmemonly nounwind }
attributes #6 = { convergent inlinehint noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #7 = { convergent noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #8 = { nounwind }
attributes #9 = { convergent }
attributes #10 = { convergent noreturn nounwind }
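; NVVM module metadata: each !{..., !"kernel", i32 1} entry below marks a
; __global__ entry point, and the !"maxntidx" entries record a
; __launch_bounds__-style cap of 1024 threads per block.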
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !38, !40, !40, !40, !40, !41, !41, !40}
!llvm.module.flags = !{!42, !43}
!llvm.ident = !{!44}
!nvvm.internalize.after.link = !{}
!nvvmir.version = !{!45}
!0 = !{void (float, i32, float*)* @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_, !"kernel", i32 1}
!1 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!2 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!3 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!4 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!5 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!6 = !{void (float, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_, !"kernel", i32 1}
!7 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!8 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!9 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!10 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!11 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!12 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!13 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!14 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!15 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1}
!16 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!17 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!18 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!19 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!20 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!21 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!22 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1}
!23 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!24 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!25 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!26 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!27 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!28 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!29 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!30 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!31 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
!32 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
!33 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1}
!34 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!35 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!36 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!37 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1}
!38 = !{null, !"align", i32 8}
!39 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!40 = !{null, !"align", i32 16}
!41 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!42 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!43 = !{i32 1, !"PIC Level", i32 2}
!44 = !{!"clang version google3-trunk (trunk r271374)"}
!45 = !{i32 1, i32 2}
!46 = !{i32 0, i32 65535}
!47 = !{i32 1, i32 1025}
!48 = !{i32 0, i32 1024}
!49 = !{i32 1, i32 65536}
!50 = distinct !{!50, !51}
!51 = !{!"llvm.loop.unroll.disable"}
!52 = distinct !{!52, !51}
!53 = !{i32 457534}
!54 = distinct !{!54, !55}
!55 = !{!"llvm.loop.unroll.enable"}
!56 = distinct !{!56, !51}
!57 = !{i32 1, i32 65}
!58 = distinct !{!58, !55}
!59 = distinct !{!59, !55}
!60 = distinct !{!60, !51}
!61 = distinct !{!61, !51}
!62 = distinct !{!62, !55}
!63 = distinct !{!63, !55}
!64 = distinct !{!64, !51}
!65 = distinct !{!65, !51}
!66 = distinct !{!66, !51}
!67 = distinct !{!67, !55}
!68 = distinct !{!68, !51}
!69 = distinct !{!69, !55}
!70 = distinct !{!70, !55}
!71 = distinct !{!71, !51}
!72 = distinct !{!72, !51}
!73 = distinct !{!73, !55}
!74 = distinct !{!74, !55}
!75 = distinct !{!75, !51}