; Created: June 1, 2016 22:31
; NOTE(review): web-export banner converted to IR comments. The original
; hosting page warned that this file may contain bidirectional Unicode
; text that could render differently than it compiles — review it in an
; editor that reveals hidden Unicode characters before trusting the
; rendering.
; ModuleID = '<stdin>' | |
source_filename = "cxx11_tensor_reduction_cuda-sm_35.cui" | |
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" | |
target triple = "nvptx64-nvidia-cuda" | |
%"struct.Eigen::internal::SumReducer" = type { i8 } | |
%"struct.Eigen::TensorEvaluator" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* } | |
%"class.Eigen::array" = type { [2 x i8] } | |
%"struct.Eigen::DSizes" = type { %"class.Eigen::array.0" } | |
%"class.Eigen::array.1" = type { [2 x i32] } | |
%"class.Eigen::array.2" = type { [1 x %"struct.Eigen::internal::TensorIntDivisor"] } | |
%"struct.Eigen::internal::TensorIntDivisor" = type { i32, i32, i32 } | |
%"class.Eigen::array.0" = type { [1 x i32] } | |
%"struct.Eigen::TensorEvaluator.3" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::DSizes.4" = type { %"class.Eigen::array.1" } | |
%"struct.Eigen::GpuDevice" = type { %"class.Eigen::StreamInterface"* } | |
%"class.Eigen::StreamInterface" = type { i32 (...)** } | |
%"struct.Eigen::TensorEvaluator.5" = type { %"struct.Eigen::TensorEvaluator", %"struct.Eigen::GpuDevice"*, float* } | |
%"struct.Eigen::internal::PtrWrapper" = type { float* } | |
%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" } | |
%"struct.Eigen::internal::(anonymous namespace)::FnSumReducer" = type { %"struct.Eigen::internal::(anonymous namespace)::Identity" } | |
%"struct.Eigen::internal::(anonymous namespace)::Identity" = type { i8 } | |
%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer" = type { float } | |
%"struct.Eigen::TensorEvaluator.6" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.8" } | |
%"struct.Eigen::TensorEvaluator.7" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::TensorEvaluator.8" = type { %"struct.Eigen::TensorEvaluator", %"class.Eigen::TensorReductionOp", %"struct.Eigen::GpuDevice"*, float* } | |
%"class.Eigen::TensorReductionOp" = type <{ %"class.Eigen::TensorMap"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }> | |
%"class.Eigen::TensorMap" = type { float*, %"struct.Eigen::DSizes.4" } | |
%"struct.Eigen::TensorEvaluator.11" = type { %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator" } | |
%"struct.Eigen::TensorEvaluator.12" = type { %"class.Eigen::array", %"struct.Eigen::DSizes", %"class.Eigen::array.1", %"class.Eigen::array.0", %"class.Eigen::array.2", %"class.Eigen::array.0", %"class.Eigen::array.0", i32, %"class.Eigen::array.0", %"class.Eigen::array.0", %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::internal::SumReducer", float*, i64, %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::TensorEvaluator.13" = type { float*, %"struct.Eigen::DSizes.4", %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::TensorEvaluator.14" = type { %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::GpuDevice"*, float* } | |
%"struct.Eigen::TensorEvaluator.15" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.17" } | |
%"struct.Eigen::TensorEvaluator.16" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"* } | |
%"struct.Eigen::TensorEvaluator.17" = type { %"struct.Eigen::TensorEvaluator.12", %"class.Eigen::TensorReductionOp.18", %"struct.Eigen::GpuDevice"*, float* } | |
%"class.Eigen::TensorReductionOp.18" = type <{ %"class.Eigen::TensorMap.20"*, %"class.Eigen::array.0", %"struct.Eigen::internal::SumReducer", [3 x i8] }> | |
%"class.Eigen::TensorMap.20" = type { float*, %"struct.Eigen::DSizes.4" } | |
%"struct.Eigen::TensorEvaluator.24" = type { %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.12" } | |
$_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_ = comdat any | |
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = comdat any | |
$_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = comdat any | |
@.str = private unnamed_addr constant [24 x i8] c"blockDim.x == BLOCK_DIM\00", align 1 | |
@.str.1 = private unnamed_addr constant [76 x i8] c"third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@.str.2 = private unnamed_addr constant [16 x i8] c"blockDim.y == 1\00", align 1 | |
@.str.3 = private unnamed_addr constant [16 x i8] c"blockDim.z == 1\00", align 1 | |
@.str.4 = private unnamed_addr constant [22 x i8] c"gridDim.x == GRID_DIM\00", align 1 | |
@.str.5 = private unnamed_addr constant [15 x i8] c"gridDim.y == 1\00", align 1 | |
@.str.6 = private unnamed_addr constant [15 x i8] c"gridDim.z == 1\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [382 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [385 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::internal::PtrWrapper<float, int>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [437 x i8] c"void Eigen::internal::(anonymous namespace)::RowReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 32, BLOCK_DIM = 256, NUM_PER_THREAD = 128, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaSumReducer]\00", align 1 | |
@__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_ = private unnamed_addr constant [440 x i8] c"void Eigen::internal::(anonymous namespace)::ColumnReduceKernel(Reducer, const Input, int, int, Output) [GRID_DIM = 128, BLOCK_DIM = 256, NUM_PER_THREAD = 16, Input = Eigen::TensorEvaluator<const Eigen::TensorMap<Eigen::Tensor<float, 2, 0, int>, 0>, Eigen::GpuDevice>, Output = Eigen::TensorEvaluator<Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, 0>, Eigen::GpuDevice>, Reducer = Eigen::internal::(anonymous namespace)::CudaMaxReducer]\00", align 1 | |
; Function Attrs: nounwind | |
define weak_odr void @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_(float, i32, float*) #0 comdat { | |
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%6 = mul nuw nsw i32 %5, %4 | |
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%8 = add nuw nsw i32 %6, %7 | |
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%10 = mul nuw nsw i32 %9, %5 | |
%11 = icmp slt i32 %8, %1 | |
br i1 %11, label %.lr.ph.preheader, label %._crit_edge | |
.lr.ph.preheader: ; preds = %3 | |
br label %.lr.ph | |
._crit_edge.loopexit: ; preds = %.lr.ph | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %3 | |
ret void | |
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph | |
%.012 = phi i32 [ %14, %.lr.ph ], [ %8, %.lr.ph.preheader ] | |
%12 = sext i32 %.012 to i64 | |
%13 = getelementptr inbounds float, float* %2, i64 %12 | |
store float %0, float* %13, align 4 | |
%14 = add nsw i32 %.012, %10 | |
%15 = icmp slt i32 %14, %1 | |
br i1 %15, label %.lr.ph, label %._crit_edge.loopexit | |
} | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.ctaid.x() #1 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.ntid.x() #1 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.tid.x() #1 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.nctaid.x() #1 | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, float*) #2 comdat { | |
%5 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%6 = shl nuw nsw i32 %5, 15 | |
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%8 = or i32 %6, %7 | |
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%10 = icmp eq i32 %9, 1 | |
br i1 %10, label %11, label %15 | |
; <label>:11: ; preds = %4 | |
%12 = icmp eq i32 %8, 0 | |
br i1 %12, label %13, label %14 | |
; <label>:13: ; preds = %11 | |
store float 0.000000e+00, float* %3, align 4 | |
br label %14 | |
; <label>:14: ; preds = %13, %11 | |
tail call void @llvm.cuda.syncthreads() | |
br label %15 | |
; <label>:15: ; preds = %14, %4 | |
%16 = sub nsw i32 %2, %8 | |
%17 = icmp sgt i32 %16, 32768 | |
%..i = select i1 %17, i32 32768, i32 %16 | |
%18 = icmp sgt i32 %16, 0 | |
br i1 %18, label %.lr.ph, label %.preheader.preheader | |
.preheader.preheader.loopexit: ; preds = %.epil.preheader | |
%.lcssa47 = phi float [ %23, %.epil.preheader ] | |
br label %.preheader.preheader | |
.preheader.preheader: ; preds = %.preheader.preheader.loopexit, %.preheader.preheader.loopexit.unr-lcssa, %15 | |
%.132.ph = phi float [ 0.000000e+00, %15 ], [ %.lcssa36.ph, %.preheader.preheader.loopexit.unr-lcssa ], [ %.lcssa47, %.preheader.preheader.loopexit ] | |
br label %.preheader | |
.preheader.preheader.loopexit.unr-lcssa.loopexit: ; preds = %32 | |
%.lcssa49 = phi i32 [ %80, %32 ] | |
%.lcssa48 = phi float [ %79, %32 ] | |
br label %.preheader.preheader.loopexit.unr-lcssa | |
.preheader.preheader.loopexit.unr-lcssa: ; preds = %.preheader.preheader.loopexit.unr-lcssa.loopexit, %.lr.ph | |
%.lcssa36.ph = phi float [ undef, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ] | |
%.02535.unr = phi i32 [ 0, %.lr.ph ], [ %.lcssa49, %.preheader.preheader.loopexit.unr-lcssa.loopexit ] | |
%.03134.unr = phi float [ 0.000000e+00, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ] | |
%lcmp.mod = icmp eq i32 %xtraiter, 0 | |
br i1 %lcmp.mod, label %.preheader.preheader, label %.epil.preheader.preheader | |
.epil.preheader.preheader: ; preds = %.preheader.preheader.loopexit.unr-lcssa | |
br label %.epil.preheader | |
.epil.preheader: ; preds = %.epil.preheader.preheader, %.epil.preheader | |
%.02535.epil = phi i32 [ %24, %.epil.preheader ], [ %.02535.unr, %.epil.preheader.preheader ] | |
%.03134.epil = phi float [ %23, %.epil.preheader ], [ %.03134.unr, %.epil.preheader.preheader ] | |
%epil.iter = phi i32 [ %epil.iter.sub, %.epil.preheader ], [ %xtraiter, %.epil.preheader.preheader ] | |
%19 = add nuw nsw i32 %.02535.epil, %8 | |
%20 = sext i32 %19 to i64 | |
%21 = getelementptr inbounds float, float* %26, i64 %20 | |
%22 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %21, i32 4) #8 | |
%23 = fadd float %.03134.epil, %22 | |
%24 = add nuw nsw i32 %.02535.epil, 256 | |
%epil.iter.sub = add i32 %epil.iter, -1 | |
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 | |
br i1 %epil.iter.cmp, label %.preheader.preheader.loopexit, label %.epil.preheader, !llvm.loop !50 | |
.lr.ph: ; preds = %15 | |
%25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0 | |
%26 = load float*, float** %25, align 8 | |
%27 = icmp sgt i32 %..i, 256 | |
%smax = select i1 %27, i32 %..i, i32 256 | |
%28 = add i32 %smax, -1 | |
%29 = lshr i32 %28, 8 | |
%30 = add nuw nsw i32 %29, 1 | |
%xtraiter = and i32 %30, 7 | |
%31 = icmp ult i32 %28, 1792 | |
br i1 %31, label %.preheader.preheader.loopexit.unr-lcssa, label %.lr.ph.new | |
.lr.ph.new: ; preds = %.lr.ph | |
%unroll_iter = sub nsw i32 %30, %xtraiter | |
br label %32 | |
; <label>:32: ; preds = %32, %.lr.ph.new | |
%.02535 = phi i32 [ 0, %.lr.ph.new ], [ %80, %32 ] | |
%.03134 = phi float [ 0.000000e+00, %.lr.ph.new ], [ %79, %32 ] | |
%niter = phi i32 [ %unroll_iter, %.lr.ph.new ], [ %niter.nsub.7, %32 ] | |
%33 = add nuw nsw i32 %.02535, %8 | |
%34 = sext i32 %33 to i64 | |
%35 = getelementptr inbounds float, float* %26, i64 %34 | |
%36 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %35, i32 4) #8 | |
%37 = fadd float %.03134, %36 | |
%38 = or i32 %.02535, 256 | |
%39 = add nuw nsw i32 %38, %8 | |
%40 = sext i32 %39 to i64 | |
%41 = getelementptr inbounds float, float* %26, i64 %40 | |
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8 | |
%43 = fadd float %37, %42 | |
%44 = or i32 %.02535, 512 | |
%45 = add nuw nsw i32 %44, %8 | |
%46 = sext i32 %45 to i64 | |
%47 = getelementptr inbounds float, float* %26, i64 %46 | |
%48 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %47, i32 4) #8 | |
%49 = fadd float %43, %48 | |
%50 = or i32 %.02535, 768 | |
%51 = add nuw nsw i32 %50, %8 | |
%52 = sext i32 %51 to i64 | |
%53 = getelementptr inbounds float, float* %26, i64 %52 | |
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8 | |
%55 = fadd float %49, %54 | |
%56 = or i32 %.02535, 1024 | |
%57 = add nuw nsw i32 %56, %8 | |
%58 = sext i32 %57 to i64 | |
%59 = getelementptr inbounds float, float* %26, i64 %58 | |
%60 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %59, i32 4) #8 | |
%61 = fadd float %55, %60 | |
%62 = or i32 %.02535, 1280 | |
%63 = add nuw nsw i32 %62, %8 | |
%64 = sext i32 %63 to i64 | |
%65 = getelementptr inbounds float, float* %26, i64 %64 | |
%66 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %65, i32 4) #8 | |
%67 = fadd float %61, %66 | |
%68 = or i32 %.02535, 1536 | |
%69 = add nuw nsw i32 %68, %8 | |
%70 = sext i32 %69 to i64 | |
%71 = getelementptr inbounds float, float* %26, i64 %70 | |
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8 | |
%73 = fadd float %67, %72 | |
%74 = or i32 %.02535, 1792 | |
%75 = add nuw nsw i32 %74, %8 | |
%76 = sext i32 %75 to i64 | |
%77 = getelementptr inbounds float, float* %26, i64 %76 | |
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8 | |
%79 = fadd float %73, %78 | |
%80 = add nsw i32 %.02535, 2048 | |
%niter.nsub.7 = add i32 %niter, -8 | |
%niter.ncmp.7 = icmp eq i32 %niter.nsub.7, 0 | |
br i1 %niter.ncmp.7, label %.preheader.preheader.loopexit.unr-lcssa.loopexit, label %32, !llvm.loop !52 | |
; <label>:81: ; preds = %.preheader | |
%.lcssa = phi float [ %85, %.preheader ] | |
%82 = and i32 %7, 31 | |
%83 = icmp eq i32 %82, 0 | |
br i1 %83, label %88, label %90 | |
.preheader: ; preds = %.preheader.preheader, %.preheader | |
%.033 = phi i32 [ %86, %.preheader ], [ 16, %.preheader.preheader ] | |
%.132 = phi float [ %85, %.preheader ], [ %.132.ph, %.preheader.preheader ] | |
%84 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.132, i32 %.033, i32 31) #3, !srcloc !53 | |
%85 = fadd float %.132, %84 | |
%86 = lshr i32 %.033, 1 | |
%87 = icmp eq i32 %86, 0 | |
br i1 %87, label %81, label %.preheader, !llvm.loop !54 | |
; <label>:88: ; preds = %81 | |
%89 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %3, float %.lcssa) #8 | |
br label %90 | |
; <label>:90: ; preds = %88, %81 | |
ret void | |
} | |
; NVVM/CUDA intrinsic declarations used by the reduction kernels below.
; (Fixed: stripped " | |" gist-table residue that made these lines unparseable.)
; Function Attrs: convergent nounwind
declare void @llvm.cuda.syncthreads() #3
; Function Attrs: argmemonly nounwind readonly
declare float @llvm.nvvm.ldg.global.f.f32.p0f32(float* nocapture, i32) #4
; Function Attrs: argmemonly nounwind
declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* nocapture, float) #5
; Function Attrs: convergent nounwind
; Eigen InnerReductionKernel<128, SumReducer<float>, ...> for float.
; Compiler-generated IR; comments below are reviewer annotations.
; Fixed: stripped the trailing " | |" gist-table residue from every line,
; which made the module unparseable. All instructions, SSA numbering and
; control flow are otherwise byte-identical to the original.
; Args: %0 = SumReducer (byval, empty), %1 = TensorEvaluator (byval),
;       %2/%3 = i32 extents (presumably inner/outer sizes — from the
;       mangled name only; confirm against Eigen's TensorReductionCuda),
;       %4 = float* output buffer.
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #2 comdat {
  ; %6 = blockDim.x, %7 = blockDim.x * 128 ("virtual" threads per block),
  ; %10 = ceil(%2 / %7), %11 = %10 * %3 (total reduction slots).
  %6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
  %7 = shl nuw nsw i32 %6, 7
  %8 = add i32 %2, -1
  %9 = add i32 %8, %7
  %10 = udiv i32 %9, %7
  %11 = mul nsw i32 %10, %3
  %12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
  %13 = mul nuw nsw i32 %12, %6
  %14 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
  %15 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
  ; With a single block (%12 == 1) the output is zero-initialized first
  ; (blocks %22 / %.lr.ph109); otherwise initialization is assumed done.
  %16 = icmp eq i32 %12, 1
  br i1 %16, label %22, label %.preheader94
.preheader94.loopexit:                            ; preds = %.lr.ph109
  br label %.preheader94
.preheader94:                                     ; preds = %.preheader94.loopexit, %22, %5
  %17 = icmp slt i32 %14, %11
  br i1 %17, label %.lr.ph106, label %._crit_edge
.lr.ph106:                                        ; preds = %.preheader94
  ; Load the input data pointer from field 10 of the TensorEvaluator.
  %18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0
  %19 = load float*, float** %18, align 8
  %20 = and i32 %15, 31
  %21 = icmp eq i32 %20, 0
  br label %30
; <label>:22:                                     ; preds = %5
  %23 = mul nuw nsw i32 %14, %6
  %24 = add nuw nsw i32 %23, %15
  %25 = icmp slt i32 %24, %3
  br i1 %25, label %.lr.ph109.preheader, label %.preheader94
.lr.ph109.preheader:                              ; preds = %22
  br label %.lr.ph109
.lr.ph109:                                        ; preds = %.lr.ph109.preheader, %.lr.ph109
  ; Zero-fill the output buffer %4 with a stride of gridDim.x * blockDim.x.
  %.081107 = phi i32 [ %28, %.lr.ph109 ], [ %24, %.lr.ph109.preheader ]
  %26 = sext i32 %.081107 to i64
  %27 = getelementptr inbounds float, float* %4, i64 %26
  store float 0.000000e+00, float* %27, align 4
  %28 = add nsw i32 %.081107, %13
  %29 = icmp slt i32 %28, %3
  br i1 %29, label %.lr.ph109, label %.preheader94.loopexit
._crit_edge.loopexit:                             ; preds = %177
  br label %._crit_edge
._crit_edge:                                      ; preds = %._crit_edge.loopexit, %.preheader94
  ret void
; <label>:30:                                     ; preds = %.lr.ph106, %177
  ; Outer loop over reduction slots; %31 = output row, %34 = column chunk.
  %.083105 = phi i32 [ %14, %.lr.ph106 ], [ %178, %177 ]
  %31 = sdiv i32 %.083105, %10
  %32 = icmp slt i32 %31, %3
  br i1 %32, label %33, label %177
; <label>:33:                                     ; preds = %30
  %34 = srem i32 %.083105, %10
  %35 = mul i32 %7, %34
  %36 = add i32 %35, %15
  %37 = mul nsw i32 %31, %2
  %38 = add i32 %36, %37
  br label %39
; <label>:39:                                     ; preds = %33, %.preheader.preheader
  ; 8 iterations of a 16-way unrolled accumulation (128 elements per thread
  ; when fully in-bounds); falls through to %157 for the remainder.
  %.086100 = phi i32 [ 0, %33 ], [ %40, %.preheader.preheader ]
  %.09299 = phi float [ 0.000000e+00, %33 ], [ %155, %.preheader.preheader ]
  %40 = add nuw nsw i32 %.086100, 16
  %41 = or i32 %.086100, 15
  %42 = mul i32 %41, %6
  %43 = add i32 %42, %36
  %44 = icmp slt i32 %43, %2
  %45 = mul i32 %.086100, %6
  br i1 %44, label %.preheader.preheader, label %157
.preheader.preheader:                             ; preds = %39
  ; Unrolled x16 body: each load is a read-only cached ldg; results are
  ; chained into a single float accumulator.
  %46 = add i32 %38, %45
  %47 = sext i32 %46 to i64
  %48 = getelementptr inbounds float, float* %19, i64 %47
  %49 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %48, i32 4) #8
  %50 = fadd float %.09299, %49
  %51 = or i32 %.086100, 1
  %52 = mul i32 %51, %6
  %53 = add i32 %38, %52
  %54 = sext i32 %53 to i64
  %55 = getelementptr inbounds float, float* %19, i64 %54
  %56 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %55, i32 4) #8
  %57 = fadd float %50, %56
  %58 = or i32 %.086100, 2
  %59 = mul i32 %58, %6
  %60 = add i32 %38, %59
  %61 = sext i32 %60 to i64
  %62 = getelementptr inbounds float, float* %19, i64 %61
  %63 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %62, i32 4) #8
  %64 = fadd float %57, %63
  %65 = or i32 %.086100, 3
  %66 = mul i32 %65, %6
  %67 = add i32 %38, %66
  %68 = sext i32 %67 to i64
  %69 = getelementptr inbounds float, float* %19, i64 %68
  %70 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %69, i32 4) #8
  %71 = fadd float %64, %70
  %72 = or i32 %.086100, 4
  %73 = mul i32 %72, %6
  %74 = add i32 %38, %73
  %75 = sext i32 %74 to i64
  %76 = getelementptr inbounds float, float* %19, i64 %75
  %77 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %76, i32 4) #8
  %78 = fadd float %71, %77
  %79 = or i32 %.086100, 5
  %80 = mul i32 %79, %6
  %81 = add i32 %38, %80
  %82 = sext i32 %81 to i64
  %83 = getelementptr inbounds float, float* %19, i64 %82
  %84 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %83, i32 4) #8
  %85 = fadd float %78, %84
  %86 = or i32 %.086100, 6
  %87 = mul i32 %86, %6
  %88 = add i32 %38, %87
  %89 = sext i32 %88 to i64
  %90 = getelementptr inbounds float, float* %19, i64 %89
  %91 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %90, i32 4) #8
  %92 = fadd float %85, %91
  %93 = or i32 %.086100, 7
  %94 = mul i32 %93, %6
  %95 = add i32 %38, %94
  %96 = sext i32 %95 to i64
  %97 = getelementptr inbounds float, float* %19, i64 %96
  %98 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %97, i32 4) #8
  %99 = fadd float %92, %98
  %100 = or i32 %.086100, 8
  %101 = mul i32 %100, %6
  %102 = add i32 %38, %101
  %103 = sext i32 %102 to i64
  %104 = getelementptr inbounds float, float* %19, i64 %103
  %105 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %104, i32 4) #8
  %106 = fadd float %99, %105
  %107 = or i32 %.086100, 9
  %108 = mul i32 %107, %6
  %109 = add i32 %38, %108
  %110 = sext i32 %109 to i64
  %111 = getelementptr inbounds float, float* %19, i64 %110
  %112 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %111, i32 4) #8
  %113 = fadd float %106, %112
  %114 = or i32 %.086100, 10
  %115 = mul i32 %114, %6
  %116 = add i32 %38, %115
  %117 = sext i32 %116 to i64
  %118 = getelementptr inbounds float, float* %19, i64 %117
  %119 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %118, i32 4) #8
  %120 = fadd float %113, %119
  %121 = or i32 %.086100, 11
  %122 = mul i32 %121, %6
  %123 = add i32 %38, %122
  %124 = sext i32 %123 to i64
  %125 = getelementptr inbounds float, float* %19, i64 %124
  %126 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %125, i32 4) #8
  %127 = fadd float %120, %126
  %128 = or i32 %.086100, 12
  %129 = mul i32 %128, %6
  %130 = add i32 %38, %129
  %131 = sext i32 %130 to i64
  %132 = getelementptr inbounds float, float* %19, i64 %131
  %133 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %132, i32 4) #8
  %134 = fadd float %127, %133
  %135 = or i32 %.086100, 13
  %136 = mul i32 %135, %6
  %137 = add i32 %38, %136
  %138 = sext i32 %137 to i64
  %139 = getelementptr inbounds float, float* %19, i64 %138
  %140 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %139, i32 4) #8
  %141 = fadd float %134, %140
  %142 = or i32 %.086100, 14
  %143 = mul i32 %142, %6
  %144 = add i32 %38, %143
  %145 = sext i32 %144 to i64
  %146 = getelementptr inbounds float, float* %19, i64 %145
  %147 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %146, i32 4) #8
  %148 = fadd float %141, %147
  %149 = or i32 %.086100, 15
  %150 = mul i32 %149, %6
  %151 = add i32 %38, %150
  %152 = sext i32 %151 to i64
  %153 = getelementptr inbounds float, float* %19, i64 %152
  %154 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %153, i32 4) #8
  %155 = fadd float %148, %154
  %156 = icmp slt i32 %40, 128
  br i1 %156, label %39, label %.critedge.loopexit125
; <label>:157:                                    ; preds = %39
  ; Scalar remainder loop for the partially in-bounds tail.
  %.lcssa = phi i32 [ %45, %39 ]
  %.09299.lcssa = phi float [ %.09299, %39 ]
  %158 = add i32 %.lcssa, %36
  %159 = icmp slt i32 %158, %2
  br i1 %159, label %.lr.ph.preheader, label %.critedge
.lr.ph.preheader:                                 ; preds = %157
  br label %.lr.ph
.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
  %.084102 = phi i32 [ %165, %.lr.ph ], [ %158, %.lr.ph.preheader ]
  %.1101 = phi float [ %164, %.lr.ph ], [ %.09299.lcssa, %.lr.ph.preheader ]
  %160 = add nsw i32 %.084102, %37
  %161 = sext i32 %160 to i64
  %162 = getelementptr inbounds float, float* %19, i64 %161
  %163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8
  %164 = fadd float %.1101, %163
  %165 = add i32 %.084102, %6
  %166 = icmp slt i32 %165, %2
  br i1 %166, label %.lr.ph, label %.critedge.loopexit
.critedge.loopexit:                               ; preds = %.lr.ph
  %.lcssa134 = phi float [ %164, %.lr.ph ]
  br label %.critedge
.critedge.loopexit125:                            ; preds = %.preheader.preheader
  %.lcssa133 = phi float [ %155, %.preheader.preheader ]
  br label %.critedge
.critedge:                                        ; preds = %.critedge.loopexit125, %.critedge.loopexit, %157
  %.3 = phi float [ %.09299.lcssa, %157 ], [ %.lcssa134, %.critedge.loopexit ], [ %.lcssa133, %.critedge.loopexit125 ]
  tail call void @llvm.cuda.syncthreads()
  br label %168
; <label>:167:                                    ; preds = %168
  %.lcssa135 = phi float [ %170, %168 ]
  br i1 %21, label %173, label %177
; <label>:168:                                    ; preds = %.critedge, %168
  ; Warp-level tree reduction via inline shfl.down.b32, offsets 16..1.
  %.0104 = phi i32 [ 16, %.critedge ], [ %171, %168 ]
  %.4103 = phi float [ %.3, %.critedge ], [ %170, %168 ]
  %169 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.4103, i32 %.0104, i32 31) #3, !srcloc !53
  %170 = fadd float %.4103, %169
  %171 = lshr i32 %.0104, 1
  %172 = icmp eq i32 %171, 0
  br i1 %172, label %167, label %168
; <label>:173:                                    ; preds = %167
  ; Lane 0 of each warp (%21) publishes the warp sum with an atomic add.
  %174 = sext i32 %31 to i64
  %175 = getelementptr inbounds float, float* %4, i64 %174
  %176 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %175, float %.lcssa135) #8
  br label %177
; <label>:177:                                    ; preds = %167, %173, %30
  tail call void @llvm.cuda.syncthreads()
  %178 = add i32 %.083105, %12
  %179 = icmp slt i32 %178, %11
  br i1 %179, label %30, label %._crit_edge.loopexit
}
; Function Attrs: convergent nounwind
; Eigen OuterReductionKernel<16, SumReducer<float>, ...> for float.
; Compiler-generated IR; comments are reviewer annotations.
; Fixed: stripped the trailing " | |" gist-table residue from every line.
; Each thread sums a 16-element strip along the outer dimension (stride %3)
; and publishes its partial with an atomic float add into %4.
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator"* byval align 8, i32, i32, float*) #2 comdat {
  %6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
  %7 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
  %8 = mul nuw nsw i32 %7, %6
  %9 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
  %10 = mul nuw nsw i32 %9, %6
  %11 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
  %12 = add nuw nsw i32 %10, %11
  ; Single-block launch zero-initializes the output first, then syncs.
  %13 = icmp eq i32 %7, 1
  br i1 %13, label %.preheader, label %19
.preheader:                                       ; preds = %5
  %14 = icmp slt i32 %12, %3
  br i1 %14, label %.lr.ph60.preheader, label %._crit_edge61
.lr.ph60.preheader:                               ; preds = %.preheader
  br label %.lr.ph60
._crit_edge61.loopexit:                           ; preds = %.lr.ph60
  br label %._crit_edge61
._crit_edge61:                                    ; preds = %._crit_edge61.loopexit, %.preheader
  tail call void @llvm.cuda.syncthreads()
  br label %19
.lr.ph60:                                         ; preds = %.lr.ph60.preheader, %.lr.ph60
  %.059 = phi i32 [ %17, %.lr.ph60 ], [ %12, %.lr.ph60.preheader ]
  %15 = sext i32 %.059 to i64
  %16 = getelementptr inbounds float, float* %4, i64 %15
  store float 0.000000e+00, float* %16, align 4
  %17 = add nsw i32 %.059, %8
  %18 = icmp slt i32 %17, %3
  br i1 %18, label %.lr.ph60, label %._crit_edge61.loopexit
; <label>:19:                                     ; preds = %._crit_edge61, %5
  ; %21 = ceil(%2 / 16) strips per column, %22 = total work items.
  %20 = add i32 %2, 15
  %21 = sdiv i32 %20, 16
  %22 = mul nsw i32 %21, %3
  %23 = icmp slt i32 %12, %22
  br i1 %23, label %.lr.ph57, label %._crit_edge58
.lr.ph57:                                         ; preds = %19
  ; Load the input data pointer from field 10 of the TensorEvaluator.
  %24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %1, i64 0, i32 10, i32 0
  %25 = load float*, float** %24, align 8
  br label %26
._crit_edge58.loopexit:                           ; preds = %._crit_edge
  br label %._crit_edge58
._crit_edge58:                                    ; preds = %._crit_edge58.loopexit, %19
  ret void
; <label>:26:                                     ; preds = %.lr.ph57, %._crit_edge
  ; Grid-stride loop over work items: %27 = column, %29 = strip start row,
  ; %..i = min(strip end, %2).
  %.04755 = phi i32 [ %12, %.lr.ph57 ], [ %36, %._crit_edge ]
  %27 = srem i32 %.04755, %3
  %28 = sdiv i32 %.04755, %3
  %29 = shl nsw i32 %28, 4
  %30 = add nsw i32 %29, 16
  %31 = icmp sgt i32 %30, %2
  %..i = select i1 %31, i32 %2, i32 %30
  %32 = icmp slt i32 %29, %..i
  br i1 %32, label %.lr.ph.preheader, label %._crit_edge
.lr.ph.preheader:                                 ; preds = %26
  br label %.lr.ph
._crit_edge.loopexit:                             ; preds = %.lr.ph
  %.lcssa = phi float [ %43, %.lr.ph ]
  br label %._crit_edge
._crit_edge:                                      ; preds = %._crit_edge.loopexit, %26
  ; Publish the strip's partial sum (0.0 for an empty strip) atomically.
  %.052.lcssa = phi float [ 0.000000e+00, %26 ], [ %.lcssa, %._crit_edge.loopexit ]
  %33 = sext i32 %27 to i64
  %34 = getelementptr inbounds float, float* %4, i64 %33
  %35 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %34, float %.052.lcssa) #8
  %36 = add nsw i32 %.04755, %8
  %37 = icmp slt i32 %36, %22
  br i1 %37, label %26, label %._crit_edge58.loopexit
.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
  ; Inner accumulation over up to 16 rows at column %27 via cached ldg loads.
  %.04654 = phi i32 [ %44, %.lr.ph ], [ %29, %.lr.ph.preheader ]
  %.05253 = phi float [ %43, %.lr.ph ], [ 0.000000e+00, %.lr.ph.preheader ]
  %38 = mul nsw i32 %.04654, %3
  %39 = add nsw i32 %38, %27
  %40 = sext i32 %39 to i64
  %41 = getelementptr inbounds float, float* %25, i64 %40
  %42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8
  %43 = fadd float %.05253, %42
  %44 = add nsw i32 %.04654, 1
  %45 = icmp slt i32 %44, %..i
  br i1 %45, label %.lr.ph, label %._crit_edge.loopexit
}
; Function Attrs: nounwind
; Eigen EigenMetaKernel over a TensorEvalTo(TensorReduction(Sum)) expression.
; Compiler-generated IR; comments are reviewer annotations.
; Fixed: stripped the trailing " | |" gist-table residue from every line.
; Each thread evaluates output coefficients %7, %7+stride, ... < %1; the
; per-coefficient reduction loop is unrolled x4 with an epilogue. When the
; reduced extent is <= 0, the output is filled with 0.0 (the sum identity).
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.5"* byval align 8, i32) #0 comdat {
  %3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
  %4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
  %5 = mul nuw nsw i32 %4, %3
  %6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
  %7 = add nuw nsw i32 %5, %6
  %8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
  %9 = mul nuw nsw i32 %8, %4
  ; SROA'd loads from the byval evaluator: two packed i32 strides (field 7),
  ; the reduced extent (field 9), input data pointer (field 10) and the
  ; output pointer (field 2).
  %.sroa.444.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 7
  %.sroa.444.0..sroa_cast = bitcast i32* %.sroa.444.0..sroa_idx to i64*
  %.sroa.444.0.copyload = load i64, i64* %.sroa.444.0..sroa_cast, align 8
  %.sroa.546.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 9, i32 0, i64 0
  %.sroa.546.0.copyload = load i32, i32* %.sroa.546.0..sroa_idx, align 8
  %.sroa.750.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 0, i32 10, i32 0
  %.sroa.750.0.copyload = load float*, float** %.sroa.750.0..sroa_idx, align 8
  %.sroa.9.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.5", %"struct.Eigen::TensorEvaluator.5"* %0, i64 0, i32 2
  %.sroa.9.0.copyload = load float*, float** %.sroa.9.0..sroa_idx, align 8
  %10 = icmp slt i32 %7, %1
  br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
.lr.ph.i:                                         ; preds = %2
  ; %11/%14 = low/high halves of the packed stride pair; %12 selects the
  ; non-empty-reduction path; unroll bookkeeping for the x4 main loop.
  %11 = trunc i64 %.sroa.444.0.copyload to i32
  %12 = icmp sgt i32 %.sroa.546.0.copyload, 0
  %13 = lshr i64 %.sroa.444.0.copyload, 32
  %14 = trunc i64 %13 to i32
  br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader:                         ; preds = %.lr.ph.i
  br label %.lr.ph.split.i
.lr.ph.split.us.i.preheader:                      ; preds = %.lr.ph.i
  %15 = add i32 %.sroa.546.0.copyload, -1
  %xtraiter = and i32 %.sroa.546.0.copyload, 3
  %16 = icmp ult i32 %15, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  %unroll_iter = sub i32 %.sroa.546.0.copyload, %xtraiter
  br label %.lr.ph.split.us.i
.lr.ph.split.us.i:                                ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
  %.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
  %17 = mul nsw i32 %.07.us.i, %11
  br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new:                            ; preds = %.lr.ph.split.us.i
  br label %18
; <label>:18:                                     ; preds = %18, %.lr.ph.split.us.i.new
  ; x4-unrolled reduction over the inner extent via cached ldg loads.
  %19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
  %.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
  %niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
  %20 = mul nsw i32 %.012.i.i.i.us.i, %14
  %21 = add nsw i32 %20, %17
  %22 = sext i32 %21 to i64
  %23 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %22
  %24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
  %25 = fadd float %19, %24
  %26 = or i32 %.012.i.i.i.us.i, 1
  %27 = mul nsw i32 %26, %14
  %28 = add nsw i32 %27, %17
  %29 = sext i32 %28 to i64
  %30 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %29
  %31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
  %32 = fadd float %25, %31
  %33 = or i32 %.012.i.i.i.us.i, 2
  %34 = mul nsw i32 %33, %14
  %35 = add nsw i32 %34, %17
  %36 = sext i32 %35 to i64
  %37 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %36
  %38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
  %39 = fadd float %32, %38
  %40 = or i32 %.012.i.i.i.us.i, 3
  %41 = mul nsw i32 %40, %14
  %42 = add nsw i32 %41, %17
  %43 = sext i32 %42 to i64
  %44 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %43
  %45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
  %46 = fadd float %39, %45
  %47 = add nsw i32 %.012.i.i.i.us.i, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
  %.lcssa66 = phi i32 [ %47, %18 ]
  %.lcssa65 = phi float [ %46, %18 ]
  br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
  %.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
  %.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
  %.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
  br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
.epil.preheader:                                  ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
  br label %48
; <label>:48:                                     ; preds = %48, %.epil.preheader
  ; Epilogue: up to 3 leftover iterations of the reduction loop.
  %49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
  %.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
  %epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
  %50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
  %51 = add nsw i32 %50, %17
  %52 = sext i32 %51 to i64
  %53 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %52
  %54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
  %55 = fadd float %49, %54
  %56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !56
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
  %.lcssa67 = phi float [ %55, %48 ]
  br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
  ; Store the finished coefficient and advance by the grid stride %9.
  %.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
  %57 = sext i32 %.07.us.i to i64
  %58 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %57
  store float %.lcssa, float* %58, align 4
  %59 = add nsw i32 %.07.us.i, %9
  %60 = icmp slt i32 %59, %1
  br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit
.lr.ph.split.i:                                   ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
  ; Empty-reduction path: write the sum identity 0.0 to every coefficient.
  %.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
  %61 = sext i32 %.07.i to i64
  %62 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %61
  store float 0.000000e+00, float* %62, align 4
  %63 = add nsw i32 %.07.i, %9
  %64 = icmp slt i32 %63, %1
  br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
  br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63: ; preds = %.lr.ph.split.i
  br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit, %2
  ret void
}
; Function Attrs: nounwind
; Eigen (anonymous namespace) InitVector<PtrWrapper<float,int>>: fills the
; first %1 floats behind the PtrWrapper %2 with the value %0 using a
; grid-stride loop. Compiler-generated IR; comments are reviewer annotations.
; Fixed: stripped the trailing " | |" gist-table residue from every line.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_(float, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #0 comdat {
  %4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46
  %5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47
  %6 = mul nuw nsw i32 %5, %4
  %7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48
  %8 = add nuw nsw i32 %6, %7
  ; Threads beyond %1 exit immediately.
  %9 = icmp slt i32 %8, %1
  br i1 %9, label %.lr.ph, label %._crit_edge
.lr.ph:                                           ; preds = %3
  ; Unwrap the raw float* from the PtrWrapper; %13 = total thread count.
  %10 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %2, i64 0, i32 0
  %11 = load float*, float** %10, align 8
  %12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49
  %13 = mul nuw nsw i32 %12, %5
  br label %14
._crit_edge.loopexit:                             ; preds = %14
  br label %._crit_edge
._crit_edge:                                      ; preds = %._crit_edge.loopexit, %3
  ret void
; <label>:14:                                     ; preds = %.lr.ph, %14
  ; Store %0 at indices %8, %8+%13, ... while < %1.
  %.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ]
  %15 = sext i32 %.08 to i64
  %16 = getelementptr inbounds float, float* %11, i64 %15
  store float %0, float* %16, align 4
  %17 = add i32 %13, %.08
  %18 = icmp slt i32 %17, %1
  br i1 %18, label %14, label %._crit_edge.loopexit
}
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 32 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %3, 32767 | |
%31 = sdiv i32 %30, 32768 | |
%32 = mul nsw i32 %31, %2 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = icmp slt i32 %33, %32 | |
br i1 %35, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%36 = and i32 %34, 31 | |
%.not = icmp ne i32 %36, 0 | |
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0 | |
%38 = load float*, float** %37, align 8 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0 | |
%40 = load float*, float** %39, align 8 | |
br label %41 | |
._crit_edge.loopexit: ; preds = %187 | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %29 | |
ret void | |
; <label>:41: ; preds = %.lr.ph, %187 | |
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ] | |
%42 = srem i32 %.0114, %31 | |
%43 = sdiv i32 %.0114, %31 | |
%44 = shl nsw i32 %42, 15 | |
%45 = or i32 %44, %34 | |
%46 = icmp slt i32 %43, %2 | |
br i1 %46, label %.preheader102, label %.thread.preheader | |
.thread.preheader.loopexit: ; preds = %.preheader.preheader | |
%.lcssa137 = phi float [ %164, %.preheader.preheader ] | |
br label %.thread.preheader | |
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41 | |
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ] | |
br label %.thread | |
.preheader102: ; preds = %41 | |
%47 = mul nsw i32 %43, %3 | |
%48 = add i32 %47, %45 | |
br label %49 | |
; <label>:49: ; preds = %.preheader102, %.preheader.preheader | |
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ] | |
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ] | |
%50 = add nuw nsw i32 %.098108, 16 | |
%51 = shl i32 %.098108, 8 | |
%52 = or i32 %51, 3840 | |
%53 = add nsw i32 %52, %45 | |
%54 = icmp slt i32 %53, %3 | |
br i1 %54, label %.preheader.preheader, label %.preheader101 | |
.preheader.preheader: ; preds = %49 | |
%55 = add i32 %48, %51 | |
%56 = sext i32 %55 to i64 | |
%57 = getelementptr inbounds float, float* %40, i64 %56 | |
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8 | |
%59 = fadd float %.095109, %58 | |
%60 = shl i32 %.098108, 8 | |
%61 = or i32 %60, 256 | |
%62 = add i32 %48, %61 | |
%63 = sext i32 %62 to i64 | |
%64 = getelementptr inbounds float, float* %40, i64 %63 | |
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8 | |
%66 = fadd float %59, %65 | |
%67 = shl i32 %.098108, 8 | |
%68 = or i32 %67, 512 | |
%69 = add i32 %48, %68 | |
%70 = sext i32 %69 to i64 | |
%71 = getelementptr inbounds float, float* %40, i64 %70 | |
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8 | |
%73 = fadd float %66, %72 | |
%74 = shl i32 %.098108, 8 | |
%75 = or i32 %74, 768 | |
%76 = add i32 %48, %75 | |
%77 = sext i32 %76 to i64 | |
%78 = getelementptr inbounds float, float* %40, i64 %77 | |
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8 | |
%80 = fadd float %73, %79 | |
%81 = shl i32 %.098108, 8 | |
%82 = or i32 %81, 1024 | |
%83 = add i32 %48, %82 | |
%84 = sext i32 %83 to i64 | |
%85 = getelementptr inbounds float, float* %40, i64 %84 | |
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8 | |
%87 = fadd float %80, %86 | |
%88 = shl i32 %.098108, 8 | |
%89 = or i32 %88, 1280 | |
%90 = add i32 %48, %89 | |
%91 = sext i32 %90 to i64 | |
%92 = getelementptr inbounds float, float* %40, i64 %91 | |
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8 | |
%94 = fadd float %87, %93 | |
%95 = shl i32 %.098108, 8 | |
%96 = or i32 %95, 1536 | |
%97 = add i32 %48, %96 | |
%98 = sext i32 %97 to i64 | |
%99 = getelementptr inbounds float, float* %40, i64 %98 | |
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8 | |
%101 = fadd float %94, %100 | |
%102 = shl i32 %.098108, 8 | |
%103 = or i32 %102, 1792 | |
%104 = add i32 %48, %103 | |
%105 = sext i32 %104 to i64 | |
%106 = getelementptr inbounds float, float* %40, i64 %105 | |
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8 | |
%108 = fadd float %101, %107 | |
%109 = shl i32 %.098108, 8 | |
%110 = or i32 %109, 2048 | |
%111 = add i32 %48, %110 | |
%112 = sext i32 %111 to i64 | |
%113 = getelementptr inbounds float, float* %40, i64 %112 | |
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8 | |
%115 = fadd float %108, %114 | |
%116 = shl i32 %.098108, 8 | |
%117 = or i32 %116, 2304 | |
%118 = add i32 %48, %117 | |
%119 = sext i32 %118 to i64 | |
%120 = getelementptr inbounds float, float* %40, i64 %119 | |
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8 | |
%122 = fadd float %115, %121 | |
%123 = shl i32 %.098108, 8 | |
%124 = or i32 %123, 2560 | |
%125 = add i32 %48, %124 | |
%126 = sext i32 %125 to i64 | |
%127 = getelementptr inbounds float, float* %40, i64 %126 | |
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8 | |
%129 = fadd float %122, %128 | |
%130 = shl i32 %.098108, 8 | |
%131 = or i32 %130, 2816 | |
%132 = add i32 %48, %131 | |
%133 = sext i32 %132 to i64 | |
%134 = getelementptr inbounds float, float* %40, i64 %133 | |
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8 | |
%136 = fadd float %129, %135 | |
%137 = shl i32 %.098108, 8 | |
%138 = or i32 %137, 3072 | |
%139 = add i32 %48, %138 | |
%140 = sext i32 %139 to i64 | |
%141 = getelementptr inbounds float, float* %40, i64 %140 | |
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8 | |
%143 = fadd float %136, %142 | |
%144 = shl i32 %.098108, 8 | |
%145 = or i32 %144, 3328 | |
%146 = add i32 %48, %145 | |
%147 = sext i32 %146 to i64 | |
%148 = getelementptr inbounds float, float* %40, i64 %147 | |
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8 | |
%150 = fadd float %143, %149 | |
%151 = shl i32 %.098108, 8 | |
%152 = or i32 %151, 3584 | |
%153 = add i32 %48, %152 | |
%154 = sext i32 %153 to i64 | |
%155 = getelementptr inbounds float, float* %40, i64 %154 | |
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8 | |
%157 = fadd float %150, %156 | |
%158 = shl i32 %.098108, 8 | |
%159 = or i32 %158, 3840 | |
%160 = add i32 %48, %159 | |
%161 = sext i32 %160 to i64 | |
%162 = getelementptr inbounds float, float* %40, i64 %161 | |
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8 | |
%164 = fadd float %157, %163 | |
%165 = icmp slt i32 %50, 128 | |
br i1 %165, label %49, label %.thread.preheader.loopexit | |
.preheader101: ; preds = %49 | |
%.lcssa = phi i32 [ %51, %49 ] | |
%.098108.lcssa = phi i32 [ %.098108, %49 ] | |
%.095109.lcssa = phi float [ %.095109, %49 ] | |
%166 = add nsw i32 %.lcssa, %45 | |
%167 = icmp slt i32 %166, %3 | |
br i1 %167, label %168, label %.thread.preheader | |
; <label>:168: ; preds = %.preheader101 | |
%169 = add nsw i32 %166, %47 | |
%170 = sext i32 %169 to i64 | |
%171 = getelementptr inbounds float, float* %40, i64 %170 | |
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8 | |
%173 = fadd float %.095109.lcssa, %172 | |
%174 = shl i32 %.098108.lcssa, 8 | |
%175 = or i32 %174, 256 | |
%176 = add nsw i32 %175, %45 | |
%177 = icmp slt i32 %176, %3 | |
br i1 %177, label %190, label %.thread.preheader | |
; <label>:178: ; preds = %.thread | |
%.lcssa138 = phi float [ %180, %.thread ] | |
%.not99 = xor i1 %46, true | |
%brmerge = or i1 %.not, %.not99 | |
br i1 %brmerge, label %187, label %183 | |
.thread: ; preds = %.thread.preheader, %.thread | |
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ] | |
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ] | |
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53 | |
%180 = fadd float %.8112, %179 | |
%181 = lshr i32 %.092113, 1 | |
%182 = icmp eq i32 %181, 0 | |
br i1 %182, label %178, label %.thread, !llvm.loop !58 | |
; <label>:183: ; preds = %178 | |
%184 = sext i32 %43 to i64 | |
%185 = getelementptr inbounds float, float* %38, i64 %184 | |
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8 | |
br label %187 | |
; <label>:187: ; preds = %178, %183 | |
%188 = add nuw nsw i32 %.0114, 32 | |
%189 = icmp slt i32 %188, %32 | |
br i1 %189, label %41, label %._crit_edge.loopexit | |
; <label>:190: ; preds = %168 | |
%191 = add nsw i32 %176, %47 | |
%192 = sext i32 %191 to i64 | |
%193 = getelementptr inbounds float, float* %40, i64 %192 | |
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8 | |
%195 = fadd float %173, %194 | |
%196 = shl i32 %.098108.lcssa, 8 | |
%197 = or i32 %196, 512 | |
%198 = add nsw i32 %197, %45 | |
%199 = icmp slt i32 %198, %3 | |
br i1 %199, label %200, label %.thread.preheader | |
; <label>:200: ; preds = %190 | |
%201 = add nsw i32 %198, %47 | |
%202 = sext i32 %201 to i64 | |
%203 = getelementptr inbounds float, float* %40, i64 %202 | |
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8 | |
%205 = fadd float %195, %204 | |
%206 = shl i32 %.098108.lcssa, 8 | |
%207 = or i32 %206, 768 | |
%208 = add nsw i32 %207, %45 | |
%209 = icmp slt i32 %208, %3 | |
br i1 %209, label %210, label %.thread.preheader | |
; <label>:210: ; preds = %200 | |
%211 = add nsw i32 %208, %47 | |
%212 = sext i32 %211 to i64 | |
%213 = getelementptr inbounds float, float* %40, i64 %212 | |
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8 | |
%215 = fadd float %205, %214 | |
%216 = shl i32 %.098108.lcssa, 8 | |
%217 = or i32 %216, 1024 | |
%218 = add nsw i32 %217, %45 | |
%219 = icmp slt i32 %218, %3 | |
br i1 %219, label %220, label %.thread.preheader | |
; <label>:220: ; preds = %210 | |
%221 = add nsw i32 %218, %47 | |
%222 = sext i32 %221 to i64 | |
%223 = getelementptr inbounds float, float* %40, i64 %222 | |
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8 | |
%225 = fadd float %215, %224 | |
%226 = shl i32 %.098108.lcssa, 8 | |
%227 = or i32 %226, 1280 | |
%228 = add nsw i32 %227, %45 | |
%229 = icmp slt i32 %228, %3 | |
br i1 %229, label %230, label %.thread.preheader | |
; <label>:230: ; preds = %220 | |
%231 = add nsw i32 %228, %47 | |
%232 = sext i32 %231 to i64 | |
%233 = getelementptr inbounds float, float* %40, i64 %232 | |
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8 | |
%235 = fadd float %225, %234 | |
%236 = shl i32 %.098108.lcssa, 8 | |
%237 = or i32 %236, 1536 | |
%238 = add nsw i32 %237, %45 | |
%239 = icmp slt i32 %238, %3 | |
br i1 %239, label %240, label %.thread.preheader | |
; <label>:240: ; preds = %230 | |
%241 = add nsw i32 %238, %47 | |
%242 = sext i32 %241 to i64 | |
%243 = getelementptr inbounds float, float* %40, i64 %242 | |
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8 | |
%245 = fadd float %235, %244 | |
%246 = shl i32 %.098108.lcssa, 8 | |
%247 = or i32 %246, 1792 | |
%248 = add nsw i32 %247, %45 | |
%249 = icmp slt i32 %248, %3 | |
br i1 %249, label %250, label %.thread.preheader | |
; <label>:250: ; preds = %240 | |
%251 = add nsw i32 %248, %47 | |
%252 = sext i32 %251 to i64 | |
%253 = getelementptr inbounds float, float* %40, i64 %252 | |
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8 | |
%255 = fadd float %245, %254 | |
%256 = shl i32 %.098108.lcssa, 8 | |
%257 = or i32 %256, 2048 | |
%258 = add nsw i32 %257, %45 | |
%259 = icmp slt i32 %258, %3 | |
br i1 %259, label %260, label %.thread.preheader | |
; <label>:260: ; preds = %250 | |
%261 = add nsw i32 %258, %47 | |
%262 = sext i32 %261 to i64 | |
%263 = getelementptr inbounds float, float* %40, i64 %262 | |
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8 | |
%265 = fadd float %255, %264 | |
%266 = shl i32 %.098108.lcssa, 8 | |
%267 = or i32 %266, 2304 | |
%268 = add nsw i32 %267, %45 | |
%269 = icmp slt i32 %268, %3 | |
br i1 %269, label %270, label %.thread.preheader | |
; <label>:270: ; preds = %260 | |
%271 = add nsw i32 %268, %47 | |
%272 = sext i32 %271 to i64 | |
%273 = getelementptr inbounds float, float* %40, i64 %272 | |
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8 | |
%275 = fadd float %265, %274 | |
%276 = shl i32 %.098108.lcssa, 8 | |
%277 = or i32 %276, 2560 | |
%278 = add nsw i32 %277, %45 | |
%279 = icmp slt i32 %278, %3 | |
br i1 %279, label %280, label %.thread.preheader | |
; <label>:280: ; preds = %270 | |
%281 = add nsw i32 %278, %47 | |
%282 = sext i32 %281 to i64 | |
%283 = getelementptr inbounds float, float* %40, i64 %282 | |
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8 | |
%285 = fadd float %275, %284 | |
%286 = shl i32 %.098108.lcssa, 8 | |
%287 = or i32 %286, 2816 | |
%288 = add nsw i32 %287, %45 | |
%289 = icmp slt i32 %288, %3 | |
br i1 %289, label %290, label %.thread.preheader | |
; <label>:290: ; preds = %280 | |
%291 = add nsw i32 %288, %47 | |
%292 = sext i32 %291 to i64 | |
%293 = getelementptr inbounds float, float* %40, i64 %292 | |
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8 | |
%295 = fadd float %285, %294 | |
%296 = shl i32 %.098108.lcssa, 8 | |
%297 = or i32 %296, 3072 | |
%298 = add nsw i32 %297, %45 | |
%299 = icmp slt i32 %298, %3 | |
br i1 %299, label %300, label %.thread.preheader | |
; <label>:300: ; preds = %290 | |
%301 = add nsw i32 %298, %47 | |
%302 = sext i32 %301 to i64 | |
%303 = getelementptr inbounds float, float* %40, i64 %302 | |
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8 | |
%305 = fadd float %295, %304 | |
%306 = shl i32 %.098108.lcssa, 8 | |
%307 = or i32 %306, 3328 | |
%308 = add nsw i32 %307, %45 | |
%309 = icmp slt i32 %308, %3 | |
br i1 %309, label %310, label %.thread.preheader | |
; <label>:310: ; preds = %300 | |
%311 = add nsw i32 %308, %47 | |
%312 = sext i32 %311 to i64 | |
%313 = getelementptr inbounds float, float* %40, i64 %312 | |
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8 | |
%315 = fadd float %305, %314 | |
%316 = shl i32 %.098108.lcssa, 8 | |
%317 = or i32 %316, 3584 | |
%318 = add nsw i32 %317, %45 | |
%319 = icmp slt i32 %318, %3 | |
br i1 %319, label %320, label %.thread.preheader | |
; <label>:320: ; preds = %310 | |
%321 = add nsw i32 %318, %47 | |
%322 = sext i32 %321 to i64 | |
%323 = getelementptr inbounds float, float* %40, i64 %322 | |
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8 | |
%325 = fadd float %315, %324 | |
br label %.thread.preheader | |
} | |
; Function Attrs: convergent inlinehint noreturn nounwind | |
define internal fastcc void @_ZL13__assert_failPKcS0_jS0_(i8*, i32, i8*) unnamed_addr #6 { | |
tail call void @__assertfail(i8* %0, i8* getelementptr inbounds ([76 x i8], [76 x i8]* @.str.1, i64 0, i64 0), i32 %1, i8* %2, i64 1) #10 | |
unreachable | |
} | |
; Function Attrs: convergent noreturn | |
declare void @__assertfail(i8*, i8*, i32, i8*, i64) #7 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.ntid.y() #1 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.ntid.z() #1 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.nctaid.y() #1 | |
; Function Attrs: nounwind readnone | |
declare i32 @llvm.ptx.read.nctaid.z() #1 | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 32 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %3, 32767 | |
%31 = sdiv i32 %30, 32768 | |
%32 = mul nsw i32 %31, %2 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = icmp slt i32 %33, %32 | |
br i1 %35, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0 | |
%36 = and i32 %34, 31 | |
%.not = icmp ne i32 %36, 0 | |
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0 | |
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0 | |
br label %39 | |
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %29 | |
ret void | |
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ] | |
%40 = srem i32 %.0114, %31 | |
%41 = sdiv i32 %.0114, %31 | |
%42 = shl nsw i32 %40, 15 | |
%43 = or i32 %42, %34 | |
%.idx.val = load float, float* %.idx, align 4 | |
%44 = icmp slt i32 %41, %2 | |
br i1 %44, label %.preheader102, label %.thread.preheader | |
.thread.preheader.loopexit: ; preds = %.preheader.preheader | |
%.lcssa137 = phi float [ %163, %.preheader.preheader ] | |
br label %.thread.preheader | |
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39 | |
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ] | |
br label %.thread | |
.preheader102: ; preds = %39 | |
%45 = mul nsw i32 %41, %3 | |
%46 = add i32 %45, %43 | |
%47 = load float*, float** %38, align 8 | |
br label %48 | |
; <label>:48: ; preds = %.preheader102, %.preheader.preheader | |
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ] | |
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ] | |
%49 = add nuw nsw i32 %.098108, 16 | |
%50 = shl i32 %.098108, 8 | |
%51 = or i32 %50, 3840 | |
%52 = add nsw i32 %51, %43 | |
%53 = icmp slt i32 %52, %3 | |
br i1 %53, label %.preheader.preheader, label %.preheader101 | |
.preheader.preheader: ; preds = %48 | |
%54 = add i32 %46, %50 | |
%55 = sext i32 %54 to i64 | |
%56 = getelementptr inbounds float, float* %47, i64 %55 | |
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8 | |
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8 | |
%59 = shl i32 %.098108, 8 | |
%60 = or i32 %59, 256 | |
%61 = add i32 %46, %60 | |
%62 = sext i32 %61 to i64 | |
%63 = getelementptr inbounds float, float* %47, i64 %62 | |
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8 | |
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8 | |
%66 = shl i32 %.098108, 8 | |
%67 = or i32 %66, 512 | |
%68 = add i32 %46, %67 | |
%69 = sext i32 %68 to i64 | |
%70 = getelementptr inbounds float, float* %47, i64 %69 | |
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8 | |
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8 | |
%73 = shl i32 %.098108, 8 | |
%74 = or i32 %73, 768 | |
%75 = add i32 %46, %74 | |
%76 = sext i32 %75 to i64 | |
%77 = getelementptr inbounds float, float* %47, i64 %76 | |
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8 | |
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8 | |
%80 = shl i32 %.098108, 8 | |
%81 = or i32 %80, 1024 | |
%82 = add i32 %46, %81 | |
%83 = sext i32 %82 to i64 | |
%84 = getelementptr inbounds float, float* %47, i64 %83 | |
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8 | |
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8 | |
%87 = shl i32 %.098108, 8 | |
%88 = or i32 %87, 1280 | |
%89 = add i32 %46, %88 | |
%90 = sext i32 %89 to i64 | |
%91 = getelementptr inbounds float, float* %47, i64 %90 | |
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8 | |
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8 | |
%94 = shl i32 %.098108, 8 | |
%95 = or i32 %94, 1536 | |
%96 = add i32 %46, %95 | |
%97 = sext i32 %96 to i64 | |
%98 = getelementptr inbounds float, float* %47, i64 %97 | |
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8 | |
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8 | |
%101 = shl i32 %.098108, 8 | |
%102 = or i32 %101, 1792 | |
%103 = add i32 %46, %102 | |
%104 = sext i32 %103 to i64 | |
%105 = getelementptr inbounds float, float* %47, i64 %104 | |
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8 | |
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8 | |
%108 = shl i32 %.098108, 8 | |
%109 = or i32 %108, 2048 | |
%110 = add i32 %46, %109 | |
%111 = sext i32 %110 to i64 | |
%112 = getelementptr inbounds float, float* %47, i64 %111 | |
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8 | |
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8 | |
%115 = shl i32 %.098108, 8 | |
%116 = or i32 %115, 2304 | |
%117 = add i32 %46, %116 | |
%118 = sext i32 %117 to i64 | |
%119 = getelementptr inbounds float, float* %47, i64 %118 | |
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8 | |
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8 | |
%122 = shl i32 %.098108, 8 | |
%123 = or i32 %122, 2560 | |
%124 = add i32 %46, %123 | |
%125 = sext i32 %124 to i64 | |
%126 = getelementptr inbounds float, float* %47, i64 %125 | |
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8 | |
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8 | |
%129 = shl i32 %.098108, 8 | |
%130 = or i32 %129, 2816 | |
%131 = add i32 %46, %130 | |
%132 = sext i32 %131 to i64 | |
%133 = getelementptr inbounds float, float* %47, i64 %132 | |
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8 | |
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8 | |
%136 = shl i32 %.098108, 8 | |
%137 = or i32 %136, 3072 | |
%138 = add i32 %46, %137 | |
%139 = sext i32 %138 to i64 | |
%140 = getelementptr inbounds float, float* %47, i64 %139 | |
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8 | |
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8 | |
%143 = shl i32 %.098108, 8 | |
%144 = or i32 %143, 3328 | |
%145 = add i32 %46, %144 | |
%146 = sext i32 %145 to i64 | |
%147 = getelementptr inbounds float, float* %47, i64 %146 | |
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8 | |
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8 | |
%150 = shl i32 %.098108, 8 | |
%151 = or i32 %150, 3584 | |
%152 = add i32 %46, %151 | |
%153 = sext i32 %152 to i64 | |
%154 = getelementptr inbounds float, float* %47, i64 %153 | |
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8 | |
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8 | |
%157 = shl i32 %.098108, 8 | |
%158 = or i32 %157, 3840 | |
%159 = add i32 %46, %158 | |
%160 = sext i32 %159 to i64 | |
%161 = getelementptr inbounds float, float* %47, i64 %160 | |
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8 | |
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8 | |
%164 = icmp slt i32 %49, 128 | |
br i1 %164, label %48, label %.thread.preheader.loopexit | |
.preheader101: ; preds = %48 | |
%.lcssa = phi i32 [ %50, %48 ] | |
%.098108.lcssa = phi i32 [ %.098108, %48 ] | |
%.095109.lcssa = phi float [ %.095109, %48 ] | |
%165 = load float*, float** %38, align 8 | |
%166 = add nsw i32 %.lcssa, %43 | |
%167 = icmp slt i32 %166, %3 | |
br i1 %167, label %168, label %.thread.preheader | |
; <label>:168: ; preds = %.preheader101 | |
%169 = add nsw i32 %166, %45 | |
%170 = sext i32 %169 to i64 | |
%171 = getelementptr inbounds float, float* %165, i64 %170 | |
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8 | |
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8 | |
%174 = shl i32 %.098108.lcssa, 8 | |
%175 = or i32 %174, 256 | |
%176 = add nsw i32 %175, %43 | |
%177 = icmp slt i32 %176, %3 | |
br i1 %177, label %198, label %.thread.preheader | |
; <label>:178: ; preds = %.thread | |
%.lcssa138 = phi float [ %180, %.thread ] | |
%.not99 = xor i1 %44, true | |
%brmerge = or i1 %.not, %.not99 | |
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183 | |
.thread: ; preds = %.thread.preheader, %.thread | |
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ] | |
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ] | |
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53 | |
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8 | |
%181 = lshr i32 %.092113, 1 | |
%182 = icmp eq i32 %181, 0 | |
br i1 %182, label %178, label %.thread, !llvm.loop !59 | |
; <label>:183: ; preds = %178 | |
%184 = load float*, float** %37, align 8 | |
%185 = sext i32 %41 to i64 | |
%186 = getelementptr inbounds float, float* %184, i64 %185 | |
%187 = bitcast float %.lcssa138 to i32 | |
%188 = bitcast float* %186 to i32* | |
%189 = load i32, i32* %188, align 4 | |
br label %190 | |
; <label>:190: ; preds = %193, %183 | |
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ] | |
%191 = bitcast i32 %.011.i to float | |
%192 = fcmp olt float %191, %.lcssa138 | |
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit | |
; <label>:193: ; preds = %190 | |
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst | |
%195 = extractvalue { i32, i1 } %194, 0 | |
%not..i = icmp eq i32 %.011.i, %195 | |
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193 | |
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178 | |
%196 = add nuw nsw i32 %.0114, 32 | |
%197 = icmp slt i32 %196, %32 | |
br i1 %197, label %39, label %._crit_edge.loopexit | |
; <label>:198: ; preds = %168 | |
%199 = add nsw i32 %176, %45 | |
%200 = sext i32 %199 to i64 | |
%201 = getelementptr inbounds float, float* %165, i64 %200 | |
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8 | |
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8 | |
%204 = shl i32 %.098108.lcssa, 8 | |
%205 = or i32 %204, 512 | |
%206 = add nsw i32 %205, %43 | |
%207 = icmp slt i32 %206, %3 | |
br i1 %207, label %208, label %.thread.preheader | |
; <label>:208: ; preds = %198 | |
%209 = add nsw i32 %206, %45 | |
%210 = sext i32 %209 to i64 | |
%211 = getelementptr inbounds float, float* %165, i64 %210 | |
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8 | |
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8 | |
%214 = shl i32 %.098108.lcssa, 8 | |
%215 = or i32 %214, 768 | |
%216 = add nsw i32 %215, %43 | |
%217 = icmp slt i32 %216, %3 | |
br i1 %217, label %218, label %.thread.preheader | |
; <label>:218: ; preds = %208 | |
%219 = add nsw i32 %216, %45 | |
%220 = sext i32 %219 to i64 | |
%221 = getelementptr inbounds float, float* %165, i64 %220 | |
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8 | |
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8 | |
%224 = shl i32 %.098108.lcssa, 8 | |
%225 = or i32 %224, 1024 | |
%226 = add nsw i32 %225, %43 | |
%227 = icmp slt i32 %226, %3 | |
br i1 %227, label %228, label %.thread.preheader | |
; <label>:228: ; preds = %218 | |
%229 = add nsw i32 %226, %45 | |
%230 = sext i32 %229 to i64 | |
%231 = getelementptr inbounds float, float* %165, i64 %230 | |
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8 | |
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8 | |
%234 = shl i32 %.098108.lcssa, 8 | |
%235 = or i32 %234, 1280 | |
%236 = add nsw i32 %235, %43 | |
%237 = icmp slt i32 %236, %3 | |
br i1 %237, label %238, label %.thread.preheader | |
; <label>:238: ; preds = %228 | |
%239 = add nsw i32 %236, %45 | |
%240 = sext i32 %239 to i64 | |
%241 = getelementptr inbounds float, float* %165, i64 %240 | |
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8 | |
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8 | |
%244 = shl i32 %.098108.lcssa, 8 | |
%245 = or i32 %244, 1536 | |
%246 = add nsw i32 %245, %43 | |
%247 = icmp slt i32 %246, %3 | |
br i1 %247, label %248, label %.thread.preheader | |
; <label>:248: ; preds = %238 | |
%249 = add nsw i32 %246, %45 | |
%250 = sext i32 %249 to i64 | |
%251 = getelementptr inbounds float, float* %165, i64 %250 | |
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8 | |
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8 | |
%254 = shl i32 %.098108.lcssa, 8 | |
%255 = or i32 %254, 1792 | |
%256 = add nsw i32 %255, %43 | |
%257 = icmp slt i32 %256, %3 | |
br i1 %257, label %258, label %.thread.preheader | |
; <label>:258: ; preds = %248 | |
%259 = add nsw i32 %256, %45 | |
%260 = sext i32 %259 to i64 | |
%261 = getelementptr inbounds float, float* %165, i64 %260 | |
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8 | |
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8 | |
%264 = shl i32 %.098108.lcssa, 8 | |
%265 = or i32 %264, 2048 | |
%266 = add nsw i32 %265, %43 | |
%267 = icmp slt i32 %266, %3 | |
br i1 %267, label %268, label %.thread.preheader | |
; <label>:268: ; preds = %258 | |
%269 = add nsw i32 %266, %45 | |
%270 = sext i32 %269 to i64 | |
%271 = getelementptr inbounds float, float* %165, i64 %270 | |
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8 | |
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8 | |
%274 = shl i32 %.098108.lcssa, 8 | |
%275 = or i32 %274, 2304 | |
%276 = add nsw i32 %275, %43 | |
%277 = icmp slt i32 %276, %3 | |
br i1 %277, label %278, label %.thread.preheader | |
; <label>:278: ; preds = %268 | |
%279 = add nsw i32 %276, %45 | |
%280 = sext i32 %279 to i64 | |
%281 = getelementptr inbounds float, float* %165, i64 %280 | |
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8 | |
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8 | |
%284 = shl i32 %.098108.lcssa, 8 | |
%285 = or i32 %284, 2560 | |
%286 = add nsw i32 %285, %43 | |
%287 = icmp slt i32 %286, %3 | |
br i1 %287, label %288, label %.thread.preheader | |
; <label>:288: ; preds = %278 | |
%289 = add nsw i32 %286, %45 | |
%290 = sext i32 %289 to i64 | |
%291 = getelementptr inbounds float, float* %165, i64 %290 | |
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8 | |
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8 | |
%294 = shl i32 %.098108.lcssa, 8 | |
%295 = or i32 %294, 2816 | |
%296 = add nsw i32 %295, %43 | |
%297 = icmp slt i32 %296, %3 | |
br i1 %297, label %298, label %.thread.preheader | |
; <label>:298: ; preds = %288 | |
%299 = add nsw i32 %296, %45 | |
%300 = sext i32 %299 to i64 | |
%301 = getelementptr inbounds float, float* %165, i64 %300 | |
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8 | |
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8 | |
%304 = shl i32 %.098108.lcssa, 8 | |
%305 = or i32 %304, 3072 | |
%306 = add nsw i32 %305, %43 | |
%307 = icmp slt i32 %306, %3 | |
br i1 %307, label %308, label %.thread.preheader | |
; <label>:308: ; preds = %298 | |
%309 = add nsw i32 %306, %45 | |
%310 = sext i32 %309 to i64 | |
%311 = getelementptr inbounds float, float* %165, i64 %310 | |
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8 | |
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8 | |
%314 = shl i32 %.098108.lcssa, 8 | |
%315 = or i32 %314, 3328 | |
%316 = add nsw i32 %315, %43 | |
%317 = icmp slt i32 %316, %3 | |
br i1 %317, label %318, label %.thread.preheader | |
; <label>:318: ; preds = %308 | |
%319 = add nsw i32 %316, %45 | |
%320 = sext i32 %319 to i64 | |
%321 = getelementptr inbounds float, float* %165, i64 %320 | |
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8 | |
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8 | |
%324 = shl i32 %.098108.lcssa, 8 | |
%325 = or i32 %324, 3584 | |
%326 = add nsw i32 %325, %43 | |
%327 = icmp slt i32 %326, %3 | |
br i1 %327, label %328, label %.thread.preheader | |
; <label>:328: ; preds = %318 | |
%329 = add nsw i32 %326, %45 | |
%330 = sext i32 %329 to i64 | |
%331 = getelementptr inbounds float, float* %165, i64 %330 | |
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8 | |
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8 | |
br label %.thread.preheader | |
} | |
; Function Attrs: convergent nounwind | |
; ColumnReduceKernel<NumBlocks=128, BlockSize=256, NumPerThread=16> instantiated
; for Eigen's anonymous-namespace CudaSumReducer (template arguments recovered
; from the mangled name).  Per-column sum reduction of a row-major 2-D float
; tensor.  Arguments: %0 = reducer object (byval, align 1), %1 = input
; TensorEvaluator (byval; field 0 is the input float*), %2 = row count,
; %3 = column count, %4 = output PtrWrapper (byval; field 0 is the output float*).
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat { | |
; --- Launch-shape asserts (source lines 93..99): blockDim must be (256,1,1)
; --- and gridDim must be (128,1,1); any mismatch calls the device-side
; --- assert handler and never returns.
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 128 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; --- Work partitioning: %31 = (%2+15)/16 row-tiles (ceil for non-negative %2),
; --- %32 = row-tiles * columns = total work items, %36 = global thread id
; --- (blockIdx.x*256 + threadIdx.x).  Threads beyond %32 exit immediately.
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %2, 15 | |
%31 = sdiv i32 %30, 16 | |
%32 = mul nsw i32 %31, %3 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = shl nuw nsw i32 %33, 8 | |
%36 = add nuw nsw i32 %35, %34 | |
%37 = icmp slt i32 %36, %32 | |
br i1 %37, label %.lr.ph, label %._crit_edge | |
; --- %38 splits on the sign of the column count: %3 >= 0 takes the "us" path
; --- (real reduction); %3 < 0 takes a compiler-specialized degenerate path
; --- below that only atomically adds 0.0 (NOTE(review): appears unreachable in
; --- practice for valid tensor sizes — kept by the optimizer for completeness).
.lr.ph: ; preds = %29 | |
%38 = icmp sgt i32 %3, -1 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0 | |
%40 = load float*, float** %39, align 8 | |
%41 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0 | |
%42 = load float*, float** %41, align 8 | |
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader | |
; --- Degenerate (%3 < 0) path setup: trip count derived from
; --- (%32-1-%36) >> 15 (stride 32768 = 128 blocks * 256 threads),
; --- then unrolled by 4 with an epilogue loop for the remainder.
.lr.ph.split.preheader: ; preds = %.lr.ph | |
%43 = add i32 %32, -1 | |
%44 = sub i32 %43, %34 | |
%45 = sub i32 %44, %35 | |
%46 = lshr i32 %45, 15 | |
%47 = add nuw nsw i32 %46, 1 | |
%xtraiter = and i32 %47, 3 | |
%48 = icmp ult i32 %45, 98304 | |
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new | |
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader | |
%unroll_iter = sub nsw i32 %47, %xtraiter | |
br label %.lr.ph.split | |
.lr.ph.split.us.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split.us | |
; --- Main path outer loop: %49 = column of this work item (%.047.us mod %3),
; --- %52 = first row of its 16-row tile.  Advances by 32768 work items per
; --- iteration (the total thread count).
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us | |
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ] | |
%49 = srem i32 %.047.us, %3 | |
%50 = sdiv i32 %.047.us, %3 | |
%51 = srem i32 %50, %31 | |
%52 = shl nsw i32 %51, 4 | |
br label %53 | |
; --- Inner loop: accumulates up to 16 rows of the column, unrolled by 2
; --- (%.04346 steps by 2, exits at 16).  Rows past %2 contribute 0.0.
; <label>:53: ; preds = %104, %.lr.ph.split.us | |
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ] | |
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ] | |
%54 = add nuw nsw i32 %.04346.us.us, %52 | |
%55 = icmp slt i32 %54, %2 | |
br i1 %55, label %56, label %62 | |
; --- In-range row: read-only cached load (ldg) of input[row*%3 + col].
; <label>:56: ; preds = %53 | |
%57 = mul nsw i32 %54, %3 | |
%58 = add nsw i32 %57, %49 | |
%59 = sext i32 %58 to i64 | |
%60 = getelementptr inbounds float, float* %40, i64 %59 | |
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8 | |
br label %62 | |
; <label>:62: ; preds = %56, %53 | |
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ] | |
%64 = fadd float %.04445.us.us, %63 | |
%65 = or i32 %.04346.us.us, 1 | |
%66 = add nuw nsw i32 %65, %52 | |
%67 = icmp slt i32 %66, %2 | |
br i1 %67, label %98, label %104 | |
; --- Tile done: atomically add the 16-row partial sum into output[col].
.us-lcssa.us.us: ; preds = %104 | |
%.lcssa = phi float [ %106, %104 ] | |
%68 = sext i32 %49 to i64 | |
%69 = getelementptr inbounds float, float* %42, i64 %68 | |
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8 | |
%71 = add nuw nsw i32 %.047.us, 32768 | |
%72 = icmp slt i32 %71, %32 | |
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit | |
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us | |
br label %._crit_edge | |
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split | |
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ] | |
br label %._crit_edge.loopexit59.unr-lcssa | |
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader | |
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ] | |
%lcmp.mod = icmp eq i32 %xtraiter, 0 | |
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader | |
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa | |
br label %.lr.ph.split.epil | |
; --- Epilogue of the degenerate path: remaining (%47 mod 4) iterations,
; --- each performing an atomic add of 0.0 to output[item mod %3].
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader | |
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ] | |
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ] | |
%73 = srem i32 %.047.epil, %3 | |
%74 = sext i32 %73 to i64 | |
%75 = getelementptr inbounds float, float* %42, i64 %74 | |
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8 | |
%77 = add nuw nsw i32 %.047.epil, 32768 | |
%epil.iter.sub = add i32 %epil.iter, -1 | |
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 | |
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !60 | |
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil | |
br label %._crit_edge.loopexit59 | |
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29 | |
ret void | |
; --- Degenerate path main body, unrolled by 4 (strides 32768/65536/98304,
; --- advancing %.047 by 131072 per iteration); all adds are of 0.0.
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new | |
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ] | |
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ] | |
%78 = srem i32 %.047, %3 | |
%79 = sext i32 %78 to i64 | |
%80 = getelementptr inbounds float, float* %42, i64 %79 | |
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8 | |
%82 = add nuw nsw i32 %.047, 32768 | |
%83 = srem i32 %82, %3 | |
%84 = sext i32 %83 to i64 | |
%85 = getelementptr inbounds float, float* %42, i64 %84 | |
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8 | |
%87 = add nsw i32 %.047, 65536 | |
%88 = srem i32 %87, %3 | |
%89 = sext i32 %88 to i64 | |
%90 = getelementptr inbounds float, float* %42, i64 %89 | |
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8 | |
%92 = add nsw i32 %.047, 98304 | |
%93 = srem i32 %92, %3 | |
%94 = sext i32 %93 to i64 | |
%95 = getelementptr inbounds float, float* %42, i64 %94 | |
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8 | |
%97 = add nsw i32 %.047, 131072 | |
%niter.nsub.3 = add i32 %niter, -4 | |
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 | |
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split | |
; --- Second half of the 2x-unrolled inner loop (odd row of the pair).
; <label>:98: ; preds = %62 | |
%99 = mul nsw i32 %66, %3 | |
%100 = add nsw i32 %99, %49 | |
%101 = sext i32 %100 to i64 | |
%102 = getelementptr inbounds float, float* %40, i64 %101 | |
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8 | |
br label %104 | |
; <label>:104: ; preds = %98, %62 | |
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ] | |
%106 = fadd float %64, %105 | |
%107 = add nsw i32 %.04346.us.us, 2 | |
%exitcond.1 = icmp eq i32 %107, 16 | |
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53 | |
} | |
; Function Attrs: convergent nounwind | |
; ColumnReduceKernel<NumBlocks=128, BlockSize=256, NumPerThread=16> instantiated
; for Eigen's anonymous-namespace CudaMaxReducer (template arguments recovered
; from the mangled name).  Per-column max reduction; same structure as the
; CudaSumReducer instantiation above but the reducer carries a float seed value
; (field 0 of %0, loaded below as %.idx45.val — presumably the reduction's
; initial/lowest value; TODO confirm against the reducer's constructor) and the
; final combine is a CAS-based atomic max instead of an atomic add.
; Arguments: %0 = reducer (byval, align 4), %1 = input evaluator (byval),
;            %2 = row count, %3 = column count, %4 = output PtrWrapper (byval).
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat { | |
; --- Launch-shape asserts: blockDim must be (256,1,1), gridDim (128,1,1).
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 128 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; --- Work partitioning: %31 = (%2+15)/16 row-tiles, %32 = total work items,
; --- %36 = global thread id (blockIdx.x*256 + threadIdx.x).
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %2, 15 | |
%31 = sdiv i32 %30, 16 | |
%32 = mul nsw i32 %31, %3 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = shl nuw nsw i32 %33, 8 | |
%36 = add nuw nsw i32 %35, %34 | |
%37 = icmp slt i32 %36, %32 | |
br i1 %37, label %.lr.ph, label %._crit_edge | |
; --- %38 splits on the sign of %3: the "us" path is the real reduction;
; --- the other path is the compiler-specialized %3 < 0 variant further down.
.lr.ph: ; preds = %29 | |
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0 | |
%38 = icmp sgt i32 %3, -1 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0 | |
%40 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0 | |
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader | |
.lr.ph.split.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split | |
.lr.ph.split.us.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split.us | |
; --- Main path outer loop: %41 = column, %44 = first row of the 16-row tile;
; --- loads the reducer seed and the input base pointer each iteration,
; --- advances by 32768 work items (total thread count) per pass.
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ] | |
%41 = srem i32 %.048.us, %3 | |
%42 = sdiv i32 %.048.us, %3 | |
%43 = srem i32 %42, %31 | |
%44 = shl nsw i32 %43, 4 | |
%.idx45.val.us = load float, float* %.idx45, align 4 | |
%45 = load float*, float** %39, align 8 | |
br label %54 | |
; --- Inlined CudaMaxReducer::atomic_reduce CAS loop: reinterpret the output
; --- word as float; while the stored value is less than the tile max (%.lcssa),
; --- try to cmpxchg in the new bits.  Exit when the stored value is >= ours or
; --- the exchange succeeds.
; <label>:46: ; preds = %49, %.us-lcssa.us.us | |
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ] | |
%47 = bitcast i32 %.011.i.us to float | |
%48 = fcmp olt float %47, %.lcssa | |
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
; <label>:49: ; preds = %46 | |
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst | |
%51 = extractvalue { i32, i1 } %50, 0 | |
%not..i.us = icmp eq i32 %.011.i.us, %51 | |
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46 | |
%52 = add nuw nsw i32 %.048.us, 32768 | |
%53 = icmp slt i32 %52, %32 | |
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit | |
; --- Inner loop: fmax-accumulate up to 16 rows of the column, unrolled by 2;
; --- out-of-range rows contribute the reducer seed (identity) value.
; <label>:54: ; preds = %112, %.lr.ph.split.us | |
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ] | |
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ] | |
%55 = add nuw nsw i32 %.04347.us.us, %44 | |
%56 = icmp slt i32 %55, %2 | |
br i1 %56, label %57, label %63 | |
; --- In-range row: read-only cached load (ldg) of input[row*%3 + col].
; <label>:57: ; preds = %54 | |
%58 = mul nsw i32 %55, %3 | |
%59 = add nsw i32 %58, %41 | |
%60 = sext i32 %59 to i64 | |
%61 = getelementptr inbounds float, float* %45, i64 %60 | |
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8 | |
br label %63 | |
; <label>:63: ; preds = %54, %57 | |
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ] | |
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8 | |
%66 = or i32 %.04347.us.us, 1 | |
%67 = add nuw nsw i32 %66, %44 | |
%68 = icmp slt i32 %67, %2 | |
br i1 %68, label %106, label %112 | |
; --- Tile done: prepare the CAS-max against output[col] (loop at %46 above).
.us-lcssa.us.us: ; preds = %112 | |
%.lcssa = phi float [ %114, %112 ] | |
%69 = load float*, float** %40, align 8 | |
%70 = sext i32 %41 to i64 | |
%71 = getelementptr inbounds float, float* %69, i64 %70 | |
%72 = bitcast float %.lcssa to i32 | |
%73 = bitcast float* %71 to i32* | |
%74 = load i32, i32* %73, align 4 | |
br label %46 | |
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
br label %._crit_edge | |
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29 | |
ret void | |
; --- Degenerate (%3 < 0) path: the 16-iteration inner loop collapses to 16
; --- chained fmax calls on the reducer seed alone (no input loads), followed
; --- by the same CAS-max into output[item mod %3].
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ] | |
%.idx45.val = load float, float* %.idx45, align 4 | |
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8 | |
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8 | |
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8 | |
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8 | |
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8 | |
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8 | |
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8 | |
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8 | |
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8 | |
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8 | |
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8 | |
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8 | |
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8 | |
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8 | |
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8 | |
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8 | |
%91 = srem i32 %.048, %3 | |
%92 = load float*, float** %40, align 8 | |
%93 = sext i32 %91 to i64 | |
%94 = getelementptr inbounds float, float* %92, i64 %93 | |
%95 = bitcast float %90 to i32 | |
%96 = bitcast float* %94 to i32* | |
%97 = load i32, i32* %96, align 4 | |
; --- CAS-max loop for the degenerate path (same shape as the one at %46).
; <label>:98: ; preds = %101, %.lr.ph.split | |
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ] | |
%99 = bitcast i32 %.011.i to float | |
%100 = fcmp olt float %99, %90 | |
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
; <label>:101: ; preds = %98 | |
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst | |
%103 = extractvalue { i32, i1 } %102, 0 | |
%not..i = icmp eq i32 %.011.i, %103 | |
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101 | |
%104 = add nuw nsw i32 %.048, 32768 | |
%105 = icmp slt i32 %104, %32 | |
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60 | |
; --- Second half of the 2x-unrolled inner loop (odd row of the pair).
; <label>:106: ; preds = %63 | |
%107 = mul nsw i32 %67, %3 | |
%108 = add nsw i32 %107, %41 | |
%109 = sext i32 %108 to i64 | |
%110 = getelementptr inbounds float, float* %45, i64 %109 | |
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8 | |
br label %112 | |
; <label>:112: ; preds = %106, %63 | |
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ] | |
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8 | |
%115 = add nsw i32 %.04347.us.us, 2 | |
%exitcond.1 = icmp eq i32 %115, 16 | |
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54 | |
} | |
; Function Attrs: nounwind
; Eigen meta-kernel, forced-eval variant: the sum reduction has already been
; materialized into a temporary buffer by an earlier kernel, so this kernel is
; a plain element-wise copy of %1 floats from that buffer into the 1-D output
; tensor.  %0 = by-value assign-op evaluator, %1 = element count.
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.6"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46   ; blockIdx.x
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47    ; blockDim.x
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48     ; threadIdx.x
%7 = add nuw nsw i32 %5, %6                                  ; global linear thread index
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49  ; gridDim.x
%9 = mul nuw nsw i32 %8, %4                                  ; loop stride = total threads in grid
; Destination data pointer: field {0,0} of the assign evaluator (the 1-D map).
%.sroa.021.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %0, i64 0, i32 0, i32 0
%.sroa.021.0.copyload = load float*, float** %.sroa.021.0..sroa_idx, align 8
; Source data pointer: field {1,3} — the forced-eval temporary buffer.
%.sroa.5.0..sroa_idx25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.6", %"struct.Eigen::TensorEvaluator.6"* %0, i64 0, i32 1, i32 3
%.sroa.5.0.copyload = load float*, float** %.sroa.5.0..sroa_idx25, align 8
%10 = icmp slt i32 %7, %1                                    ; any work for this thread?
br i1 %10, label %.lr.ph.i.preheader, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
.lr.ph.i.preheader:                               ; preds = %2
br label %.lr.ph.i
; Copy loop: each iteration moves one float (as raw i32 bits — a bitwise copy,
; no FP op involved) from source[i] to dest[i], then advances i by the grid
; stride %9 until i >= %1.
.lr.ph.i:                                         ; preds = %.lr.ph.i.preheader, %.lr.ph.i
%.07.i = phi i32 [ %17, %.lr.ph.i ], [ %7, %.lr.ph.i.preheader ]
%11 = sext i32 %.07.i to i64
%12 = getelementptr inbounds float, float* %.sroa.5.0.copyload, i64 %11
%13 = bitcast float* %12 to i32*
%14 = load i32, i32* %13, align 4
%15 = getelementptr inbounds float, float* %.sroa.021.0.copyload, i64 %11
%16 = bitcast float* %15 to i32*
store i32 %14, i32* %16, align 4
%17 = add nsw i32 %.07.i, %9
%18 = icmp slt i32 %17, %1
br i1 %18, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit: ; preds = %.lr.ph.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit, %2
ret void
}
; Function Attrs: nounwind
; Eigen meta-kernel, direct variant (no forced-eval temporary): assigns a
; sum-reduction over one dimension of a 2-D float tensor into a 1-D output.
; Each thread walks output indices with the grid stride; for each output index
; it sums the whole reduced dimension from the 2-D input via __ldg loads, with
; the inner loop unrolled by 4 plus a remainder epilogue.
; %0 = by-value assign-op evaluator, %1 = number of output elements.
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.11"* byval align 8, i32) #0 comdat {
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46   ; blockIdx.x
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47    ; blockDim.x
%5 = mul nuw nsw i32 %4, %3
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48     ; threadIdx.x
%7 = add nuw nsw i32 %5, %6                                  ; global linear thread index
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49  ; gridDim.x
%9 = mul nuw nsw i32 %8, %4                                  ; loop stride = total threads in grid
; Output data pointer: field {0,0} of the assign evaluator.
%.sroa.041.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 0, i32 0
%.sroa.041.0.copyload = load float*, float** %.sroa.041.0..sroa_idx, align 8
; Two adjacent i32 evaluator fields (reduction evaluator fields 7 and 8[0])
; loaded as a single i64; split into %11 (low half) and %14 (high half) below.
%.sroa.545.0..sroa_idx46 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 7
%.sroa.545.0..sroa_cast = bitcast i32* %.sroa.545.0..sroa_idx46 to i64*
%.sroa.545.0.copyload = load i64, i64* %.sroa.545.0..sroa_cast, align 8
; Extent of the reduced dimension (inner-loop trip count).
%.sroa.648.0..sroa_idx49 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 9, i32 0, i64 0
%.sroa.648.0.copyload = load i32, i32* %.sroa.648.0..sroa_idx49, align 8
; Input (2-D tensor) data pointer: field {1,10,0}.
%.sroa.8.0..sroa_idx53 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.11", %"struct.Eigen::TensorEvaluator.11"* %0, i64 0, i32 1, i32 10, i32 0
%.sroa.8.0.copyload = load float*, float** %.sroa.8.0..sroa_idx53, align 8
%10 = icmp slt i32 %7, %1                                    ; any work for this thread?
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
.lr.ph.i:                                         ; preds = %2
; %11 scales the output index (per-output-element input offset); %14 scales
; the inner reduction index (stride along the reduced dimension) — see the
; address arithmetic at %17 and %20.
%11 = trunc i64 %.sroa.545.0.copyload to i32
%12 = icmp sgt i32 %.sroa.648.0.copyload, 0                  ; non-empty reduction?
%13 = lshr i64 %.sroa.545.0.copyload, 32
%14 = trunc i64 %13 to i32
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader
.lr.ph.split.i.preheader:                         ; preds = %.lr.ph.i
br label %.lr.ph.split.i
; Unroll bookkeeping for the 4x-unrolled inner loop:
;   %xtraiter = trip count mod 4 (epilogue iterations),
;   %unroll_iter = trip count rounded down to a multiple of 4,
;   %16 = true when fewer than 4 iterations exist (skip the unrolled body).
.lr.ph.split.us.i.preheader:                      ; preds = %.lr.ph.i
%15 = add i32 %.sroa.648.0.copyload, -1
%xtraiter = and i32 %.sroa.648.0.copyload, 3
%16 = icmp ult i32 %15, 3
%lcmp.mod = icmp eq i32 %xtraiter, 0
%unroll_iter = sub i32 %.sroa.648.0.copyload, %xtraiter
br label %.lr.ph.split.us.i
; Outer loop over output indices handled by this thread.
.lr.ph.split.us.i:                                ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ]
%17 = mul nsw i32 %.07.us.i, %11                             ; base input offset for this output element
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new
.lr.ph.split.us.i.new:                            ; preds = %.lr.ph.split.us.i
br label %18
; Unrolled inner loop: four __ldg loads and fadds per iteration, summing
; input[(j+k)*%14 + %17] for k = 0..3 into the running accumulator %19.
; <label>:18:                                     ; preds = %18, %.lr.ph.split.us.i.new
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ]
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ]
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ]
%20 = mul nsw i32 %.012.i.i.i.us.i, %14
%21 = add nsw i32 %20, %17
%22 = sext i32 %21 to i64
%23 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %22
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8
%25 = fadd float %19, %24
%26 = or i32 %.012.i.i.i.us.i, 1
%27 = mul nsw i32 %26, %14
%28 = add nsw i32 %27, %17
%29 = sext i32 %28 to i64
%30 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %29
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8
%32 = fadd float %25, %31
%33 = or i32 %.012.i.i.i.us.i, 2
%34 = mul nsw i32 %33, %14
%35 = add nsw i32 %34, %17
%36 = sext i32 %35 to i64
%37 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %36
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8
%39 = fadd float %32, %38
%40 = or i32 %.012.i.i.i.us.i, 3
%41 = mul nsw i32 %40, %14
%42 = add nsw i32 %41, %17
%43 = sext i32 %42 to i64
%44 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %43
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8
%46 = fadd float %39, %45
%47 = add nsw i32 %.012.i.i.i.us.i, 4
%niter.nsub.3 = add i32 %niter, -4
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18
%.lcssa67 = phi i32 [ %47, %18 ]
%.lcssa66 = phi float [ %46, %18 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
; Merge point after the unrolled body: pick up the partial sum and index,
; then run the epilogue if the trip count was not a multiple of 4.
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ]
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader
.epil.preheader:                                  ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa
br label %48
; Epilogue loop: one __ldg + fadd per remaining iteration (%xtraiter of them).
; <label>:48:                                     ; preds = %48, %.epil.preheader
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ]
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ]
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ]
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14
%51 = add nsw i32 %50, %17
%52 = sext i32 %51 to i64
%53 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %52
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8
%55 = fadd float %49, %54
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1
%epil.iter.sub = add i32 %epil.iter, -1
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !61
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48
%.lcssa68 = phi float [ %55, %48 ]
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
; Reduction for this output index is done: store the sum and advance the
; output index by the grid stride %9.
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa68, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ]
%57 = sext i32 %.07.us.i to i64
%58 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %57
store float %.lcssa, float* %58, align 4
%59 = add nsw i32 %.07.us.i, %9
%60 = icmp slt i32 %59, %1
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit
; Empty-reduction path (reduced extent <= 0): every output element handled by
; this thread is set to 0.0, the identity of the sum reducer.
.lr.ph.split.i:                                   ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ]
%61 = sext i32 %.07.i to i64
%62 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %61
store float 0.000000e+00, float* %62, align 4
%63 = add nsw i32 %.07.i, %9
%64 = icmp slt i32 %63, %1
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64: ; preds = %.lr.ph.split.i
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit, %2
ret void
}
; Function Attrs: nounwind
; InitVector kernel: fills the first %1 elements of a 1-D float tensor with
; the scalar %0 (used to seed a reduction output buffer).  %2 = by-value
; evaluator holding the destination data pointer.  Each thread writes indices
; starting at its global linear id, stepping by the total thread count.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #0 comdat {
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46   ; blockIdx.x
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47    ; blockDim.x
%6 = mul nuw nsw i32 %5, %4
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48     ; threadIdx.x
%8 = add nuw nsw i32 %6, %7                                  ; global linear thread index
%9 = icmp slt i32 %8, %1                                     ; any work for this thread?
br i1 %9, label %.lr.ph, label %._crit_edge
.lr.ph:                                           ; preds = %3
; Destination data pointer: field 0 of the evaluator.
%10 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %2, i64 0, i32 0
%11 = load float*, float** %10, align 8
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 ; gridDim.x
%13 = mul nuw nsw i32 %12, %5                                ; loop stride = total threads in grid
br label %14
._crit_edge.loopexit:                             ; preds = %14
br label %._crit_edge
._crit_edge:                                      ; preds = %._crit_edge.loopexit, %3
ret void
; Fill loop: dest[i] = %0 for i = tid, tid+stride, ... while i < %1.
; <label>:14:                                     ; preds = %.lr.ph, %14
%.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ]
%15 = sext i32 %.08 to i64
%16 = getelementptr inbounds float, float* %11, i64 %15
store float %0, float* %16, align 4
%17 = add i32 %13, %.08
%18 = icmp slt i32 %17, %1
br i1 %18, label %14, label %._crit_edge.loopexit
}
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 32 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %3, 32767 | |
%31 = sdiv i32 %30, 32768 | |
%32 = mul nsw i32 %31, %2 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = icmp slt i32 %33, %32 | |
br i1 %35, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%36 = and i32 %34, 31 | |
%.not = icmp ne i32 %36, 0 | |
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0 | |
%38 = load float*, float** %37, align 8 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0 | |
%40 = load float*, float** %39, align 8 | |
br label %41 | |
._crit_edge.loopexit: ; preds = %187 | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %29 | |
ret void | |
; <label>:41: ; preds = %.lr.ph, %187 | |
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ] | |
%42 = srem i32 %.0114, %31 | |
%43 = sdiv i32 %.0114, %31 | |
%44 = shl nsw i32 %42, 15 | |
%45 = or i32 %44, %34 | |
%46 = icmp slt i32 %43, %2 | |
br i1 %46, label %.preheader102, label %.thread.preheader | |
.thread.preheader.loopexit: ; preds = %.preheader.preheader | |
%.lcssa137 = phi float [ %164, %.preheader.preheader ] | |
br label %.thread.preheader | |
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41 | |
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ] | |
br label %.thread | |
.preheader102: ; preds = %41 | |
%47 = mul nsw i32 %43, %3 | |
%48 = add i32 %47, %45 | |
br label %49 | |
; <label>:49: ; preds = %.preheader102, %.preheader.preheader | |
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ] | |
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ] | |
%50 = add nuw nsw i32 %.098108, 16 | |
%51 = shl i32 %.098108, 8 | |
%52 = or i32 %51, 3840 | |
%53 = add nsw i32 %52, %45 | |
%54 = icmp slt i32 %53, %3 | |
br i1 %54, label %.preheader.preheader, label %.preheader101 | |
.preheader.preheader: ; preds = %49 | |
%55 = add i32 %48, %51 | |
%56 = sext i32 %55 to i64 | |
%57 = getelementptr inbounds float, float* %40, i64 %56 | |
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8 | |
%59 = fadd float %.095109, %58 | |
%60 = shl i32 %.098108, 8 | |
%61 = or i32 %60, 256 | |
%62 = add i32 %48, %61 | |
%63 = sext i32 %62 to i64 | |
%64 = getelementptr inbounds float, float* %40, i64 %63 | |
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8 | |
%66 = fadd float %59, %65 | |
%67 = shl i32 %.098108, 8 | |
%68 = or i32 %67, 512 | |
%69 = add i32 %48, %68 | |
%70 = sext i32 %69 to i64 | |
%71 = getelementptr inbounds float, float* %40, i64 %70 | |
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8 | |
%73 = fadd float %66, %72 | |
%74 = shl i32 %.098108, 8 | |
%75 = or i32 %74, 768 | |
%76 = add i32 %48, %75 | |
%77 = sext i32 %76 to i64 | |
%78 = getelementptr inbounds float, float* %40, i64 %77 | |
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8 | |
%80 = fadd float %73, %79 | |
%81 = shl i32 %.098108, 8 | |
%82 = or i32 %81, 1024 | |
%83 = add i32 %48, %82 | |
%84 = sext i32 %83 to i64 | |
%85 = getelementptr inbounds float, float* %40, i64 %84 | |
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8 | |
%87 = fadd float %80, %86 | |
%88 = shl i32 %.098108, 8 | |
%89 = or i32 %88, 1280 | |
%90 = add i32 %48, %89 | |
%91 = sext i32 %90 to i64 | |
%92 = getelementptr inbounds float, float* %40, i64 %91 | |
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8 | |
%94 = fadd float %87, %93 | |
%95 = shl i32 %.098108, 8 | |
%96 = or i32 %95, 1536 | |
%97 = add i32 %48, %96 | |
%98 = sext i32 %97 to i64 | |
%99 = getelementptr inbounds float, float* %40, i64 %98 | |
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8 | |
%101 = fadd float %94, %100 | |
%102 = shl i32 %.098108, 8 | |
%103 = or i32 %102, 1792 | |
%104 = add i32 %48, %103 | |
%105 = sext i32 %104 to i64 | |
%106 = getelementptr inbounds float, float* %40, i64 %105 | |
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8 | |
%108 = fadd float %101, %107 | |
%109 = shl i32 %.098108, 8 | |
%110 = or i32 %109, 2048 | |
%111 = add i32 %48, %110 | |
%112 = sext i32 %111 to i64 | |
%113 = getelementptr inbounds float, float* %40, i64 %112 | |
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8 | |
%115 = fadd float %108, %114 | |
%116 = shl i32 %.098108, 8 | |
%117 = or i32 %116, 2304 | |
%118 = add i32 %48, %117 | |
%119 = sext i32 %118 to i64 | |
%120 = getelementptr inbounds float, float* %40, i64 %119 | |
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8 | |
%122 = fadd float %115, %121 | |
%123 = shl i32 %.098108, 8 | |
%124 = or i32 %123, 2560 | |
%125 = add i32 %48, %124 | |
%126 = sext i32 %125 to i64 | |
%127 = getelementptr inbounds float, float* %40, i64 %126 | |
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8 | |
%129 = fadd float %122, %128 | |
%130 = shl i32 %.098108, 8 | |
%131 = or i32 %130, 2816 | |
%132 = add i32 %48, %131 | |
%133 = sext i32 %132 to i64 | |
%134 = getelementptr inbounds float, float* %40, i64 %133 | |
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8 | |
%136 = fadd float %129, %135 | |
%137 = shl i32 %.098108, 8 | |
%138 = or i32 %137, 3072 | |
%139 = add i32 %48, %138 | |
%140 = sext i32 %139 to i64 | |
%141 = getelementptr inbounds float, float* %40, i64 %140 | |
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8 | |
%143 = fadd float %136, %142 | |
%144 = shl i32 %.098108, 8 | |
%145 = or i32 %144, 3328 | |
%146 = add i32 %48, %145 | |
%147 = sext i32 %146 to i64 | |
%148 = getelementptr inbounds float, float* %40, i64 %147 | |
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8 | |
%150 = fadd float %143, %149 | |
%151 = shl i32 %.098108, 8 | |
%152 = or i32 %151, 3584 | |
%153 = add i32 %48, %152 | |
%154 = sext i32 %153 to i64 | |
%155 = getelementptr inbounds float, float* %40, i64 %154 | |
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8 | |
%157 = fadd float %150, %156 | |
%158 = shl i32 %.098108, 8 | |
%159 = or i32 %158, 3840 | |
%160 = add i32 %48, %159 | |
%161 = sext i32 %160 to i64 | |
%162 = getelementptr inbounds float, float* %40, i64 %161 | |
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8 | |
%164 = fadd float %157, %163 | |
%165 = icmp slt i32 %50, 128 | |
br i1 %165, label %49, label %.thread.preheader.loopexit | |
.preheader101: ; preds = %49 | |
%.lcssa = phi i32 [ %51, %49 ] | |
%.098108.lcssa = phi i32 [ %.098108, %49 ] | |
%.095109.lcssa = phi float [ %.095109, %49 ] | |
%166 = add nsw i32 %.lcssa, %45 | |
%167 = icmp slt i32 %166, %3 | |
br i1 %167, label %168, label %.thread.preheader | |
; <label>:168: ; preds = %.preheader101 | |
%169 = add nsw i32 %166, %47 | |
%170 = sext i32 %169 to i64 | |
%171 = getelementptr inbounds float, float* %40, i64 %170 | |
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8 | |
%173 = fadd float %.095109.lcssa, %172 | |
%174 = shl i32 %.098108.lcssa, 8 | |
%175 = or i32 %174, 256 | |
%176 = add nsw i32 %175, %45 | |
%177 = icmp slt i32 %176, %3 | |
br i1 %177, label %190, label %.thread.preheader | |
; <label>:178: ; preds = %.thread | |
%.lcssa138 = phi float [ %180, %.thread ] | |
%.not99 = xor i1 %46, true | |
%brmerge = or i1 %.not, %.not99 | |
br i1 %brmerge, label %187, label %183 | |
.thread: ; preds = %.thread.preheader, %.thread | |
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ] | |
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ] | |
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53 | |
%180 = fadd float %.8112, %179 | |
%181 = lshr i32 %.092113, 1 | |
%182 = icmp eq i32 %181, 0 | |
br i1 %182, label %178, label %.thread, !llvm.loop !62 | |
; <label>:183: ; preds = %178 | |
%184 = sext i32 %43 to i64 | |
%185 = getelementptr inbounds float, float* %38, i64 %184 | |
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8 | |
br label %187 | |
; <label>:187: ; preds = %178, %183 | |
%188 = add nuw nsw i32 %.0114, 32 | |
%189 = icmp slt i32 %188, %32 | |
br i1 %189, label %41, label %._crit_edge.loopexit | |
; <label>:190: ; preds = %168 | |
%191 = add nsw i32 %176, %47 | |
%192 = sext i32 %191 to i64 | |
%193 = getelementptr inbounds float, float* %40, i64 %192 | |
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8 | |
%195 = fadd float %173, %194 | |
%196 = shl i32 %.098108.lcssa, 8 | |
%197 = or i32 %196, 512 | |
%198 = add nsw i32 %197, %45 | |
%199 = icmp slt i32 %198, %3 | |
br i1 %199, label %200, label %.thread.preheader | |
; <label>:200: ; preds = %190 | |
%201 = add nsw i32 %198, %47 | |
%202 = sext i32 %201 to i64 | |
%203 = getelementptr inbounds float, float* %40, i64 %202 | |
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8 | |
%205 = fadd float %195, %204 | |
%206 = shl i32 %.098108.lcssa, 8 | |
%207 = or i32 %206, 768 | |
%208 = add nsw i32 %207, %45 | |
%209 = icmp slt i32 %208, %3 | |
br i1 %209, label %210, label %.thread.preheader | |
; <label>:210: ; preds = %200 | |
%211 = add nsw i32 %208, %47 | |
%212 = sext i32 %211 to i64 | |
%213 = getelementptr inbounds float, float* %40, i64 %212 | |
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8 | |
%215 = fadd float %205, %214 | |
%216 = shl i32 %.098108.lcssa, 8 | |
%217 = or i32 %216, 1024 | |
%218 = add nsw i32 %217, %45 | |
%219 = icmp slt i32 %218, %3 | |
br i1 %219, label %220, label %.thread.preheader | |
; <label>:220: ; preds = %210 | |
%221 = add nsw i32 %218, %47 | |
%222 = sext i32 %221 to i64 | |
%223 = getelementptr inbounds float, float* %40, i64 %222 | |
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8 | |
%225 = fadd float %215, %224 | |
%226 = shl i32 %.098108.lcssa, 8 | |
%227 = or i32 %226, 1280 | |
%228 = add nsw i32 %227, %45 | |
%229 = icmp slt i32 %228, %3 | |
br i1 %229, label %230, label %.thread.preheader | |
; <label>:230: ; preds = %220 | |
%231 = add nsw i32 %228, %47 | |
%232 = sext i32 %231 to i64 | |
%233 = getelementptr inbounds float, float* %40, i64 %232 | |
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8 | |
%235 = fadd float %225, %234 | |
%236 = shl i32 %.098108.lcssa, 8 | |
%237 = or i32 %236, 1536 | |
%238 = add nsw i32 %237, %45 | |
%239 = icmp slt i32 %238, %3 | |
br i1 %239, label %240, label %.thread.preheader | |
; <label>:240: ; preds = %230 | |
%241 = add nsw i32 %238, %47 | |
%242 = sext i32 %241 to i64 | |
%243 = getelementptr inbounds float, float* %40, i64 %242 | |
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8 | |
%245 = fadd float %235, %244 | |
%246 = shl i32 %.098108.lcssa, 8 | |
%247 = or i32 %246, 1792 | |
%248 = add nsw i32 %247, %45 | |
%249 = icmp slt i32 %248, %3 | |
br i1 %249, label %250, label %.thread.preheader | |
; <label>:250: ; preds = %240 | |
%251 = add nsw i32 %248, %47 | |
%252 = sext i32 %251 to i64 | |
%253 = getelementptr inbounds float, float* %40, i64 %252 | |
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8 | |
%255 = fadd float %245, %254 | |
%256 = shl i32 %.098108.lcssa, 8 | |
%257 = or i32 %256, 2048 | |
%258 = add nsw i32 %257, %45 | |
%259 = icmp slt i32 %258, %3 | |
br i1 %259, label %260, label %.thread.preheader | |
; <label>:260: ; preds = %250 | |
%261 = add nsw i32 %258, %47 | |
%262 = sext i32 %261 to i64 | |
%263 = getelementptr inbounds float, float* %40, i64 %262 | |
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8 | |
%265 = fadd float %255, %264 | |
%266 = shl i32 %.098108.lcssa, 8 | |
%267 = or i32 %266, 2304 | |
%268 = add nsw i32 %267, %45 | |
%269 = icmp slt i32 %268, %3 | |
br i1 %269, label %270, label %.thread.preheader | |
; <label>:270: ; preds = %260 | |
%271 = add nsw i32 %268, %47 | |
%272 = sext i32 %271 to i64 | |
%273 = getelementptr inbounds float, float* %40, i64 %272 | |
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8 | |
%275 = fadd float %265, %274 | |
%276 = shl i32 %.098108.lcssa, 8 | |
%277 = or i32 %276, 2560 | |
%278 = add nsw i32 %277, %45 | |
%279 = icmp slt i32 %278, %3 | |
br i1 %279, label %280, label %.thread.preheader | |
; <label>:280: ; preds = %270 | |
%281 = add nsw i32 %278, %47 | |
%282 = sext i32 %281 to i64 | |
%283 = getelementptr inbounds float, float* %40, i64 %282 | |
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8 | |
%285 = fadd float %275, %284 | |
%286 = shl i32 %.098108.lcssa, 8 | |
%287 = or i32 %286, 2816 | |
%288 = add nsw i32 %287, %45 | |
%289 = icmp slt i32 %288, %3 | |
br i1 %289, label %290, label %.thread.preheader | |
; <label>:290: ; preds = %280 | |
%291 = add nsw i32 %288, %47 | |
%292 = sext i32 %291 to i64 | |
%293 = getelementptr inbounds float, float* %40, i64 %292 | |
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8 | |
%295 = fadd float %285, %294 | |
%296 = shl i32 %.098108.lcssa, 8 | |
%297 = or i32 %296, 3072 | |
%298 = add nsw i32 %297, %45 | |
%299 = icmp slt i32 %298, %3 | |
br i1 %299, label %300, label %.thread.preheader | |
; <label>:300: ; preds = %290 | |
%301 = add nsw i32 %298, %47 | |
%302 = sext i32 %301 to i64 | |
%303 = getelementptr inbounds float, float* %40, i64 %302 | |
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8 | |
%305 = fadd float %295, %304 | |
%306 = shl i32 %.098108.lcssa, 8 | |
%307 = or i32 %306, 3328 | |
%308 = add nsw i32 %307, %45 | |
%309 = icmp slt i32 %308, %3 | |
br i1 %309, label %310, label %.thread.preheader | |
; <label>:310: ; preds = %300 | |
%311 = add nsw i32 %308, %47 | |
%312 = sext i32 %311 to i64 | |
%313 = getelementptr inbounds float, float* %40, i64 %312 | |
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8 | |
%315 = fadd float %305, %314 | |
%316 = shl i32 %.098108.lcssa, 8 | |
%317 = or i32 %316, 3584 | |
%318 = add nsw i32 %317, %45 | |
%319 = icmp slt i32 %318, %3 | |
br i1 %319, label %320, label %.thread.preheader | |
; <label>:320: ; preds = %310 | |
%321 = add nsw i32 %318, %47 | |
%322 = sext i32 %321 to i64 | |
%323 = getelementptr inbounds float, float* %40, i64 %322 | |
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8 | |
%325 = fadd float %315, %324 | |
br label %.thread.preheader | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 32 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %3, 32767 | |
%31 = sdiv i32 %30, 32768 | |
%32 = mul nsw i32 %31, %2 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = icmp slt i32 %33, %32 | |
br i1 %35, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0 | |
%36 = and i32 %34, 31 | |
%.not = icmp ne i32 %36, 0 | |
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0 | |
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0 | |
br label %39 | |
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %29 | |
ret void | |
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ] | |
%40 = srem i32 %.0114, %31 | |
%41 = sdiv i32 %.0114, %31 | |
%42 = shl nsw i32 %40, 15 | |
%43 = or i32 %42, %34 | |
%.idx.val = load float, float* %.idx, align 4 | |
%44 = icmp slt i32 %41, %2 | |
br i1 %44, label %.preheader102, label %.thread.preheader | |
.thread.preheader.loopexit: ; preds = %.preheader.preheader | |
%.lcssa137 = phi float [ %163, %.preheader.preheader ] | |
br label %.thread.preheader | |
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39 | |
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ] | |
br label %.thread | |
.preheader102: ; preds = %39 | |
%45 = mul nsw i32 %41, %3 | |
%46 = add i32 %45, %43 | |
%47 = load float*, float** %38, align 8 | |
br label %48 | |
; <label>:48: ; preds = %.preheader102, %.preheader.preheader | |
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ] | |
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ] | |
%49 = add nuw nsw i32 %.098108, 16 | |
%50 = shl i32 %.098108, 8 | |
%51 = or i32 %50, 3840 | |
%52 = add nsw i32 %51, %43 | |
%53 = icmp slt i32 %52, %3 | |
br i1 %53, label %.preheader.preheader, label %.preheader101 | |
.preheader.preheader: ; preds = %48 | |
%54 = add i32 %46, %50 | |
%55 = sext i32 %54 to i64 | |
%56 = getelementptr inbounds float, float* %47, i64 %55 | |
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8 | |
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8 | |
%59 = shl i32 %.098108, 8 | |
%60 = or i32 %59, 256 | |
%61 = add i32 %46, %60 | |
%62 = sext i32 %61 to i64 | |
%63 = getelementptr inbounds float, float* %47, i64 %62 | |
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8 | |
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8 | |
%66 = shl i32 %.098108, 8 | |
%67 = or i32 %66, 512 | |
%68 = add i32 %46, %67 | |
%69 = sext i32 %68 to i64 | |
%70 = getelementptr inbounds float, float* %47, i64 %69 | |
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8 | |
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8 | |
%73 = shl i32 %.098108, 8 | |
%74 = or i32 %73, 768 | |
%75 = add i32 %46, %74 | |
%76 = sext i32 %75 to i64 | |
%77 = getelementptr inbounds float, float* %47, i64 %76 | |
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8 | |
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8 | |
%80 = shl i32 %.098108, 8 | |
%81 = or i32 %80, 1024 | |
%82 = add i32 %46, %81 | |
%83 = sext i32 %82 to i64 | |
%84 = getelementptr inbounds float, float* %47, i64 %83 | |
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8 | |
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8 | |
%87 = shl i32 %.098108, 8 | |
%88 = or i32 %87, 1280 | |
%89 = add i32 %46, %88 | |
%90 = sext i32 %89 to i64 | |
%91 = getelementptr inbounds float, float* %47, i64 %90 | |
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8 | |
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8 | |
%94 = shl i32 %.098108, 8 | |
%95 = or i32 %94, 1536 | |
%96 = add i32 %46, %95 | |
%97 = sext i32 %96 to i64 | |
%98 = getelementptr inbounds float, float* %47, i64 %97 | |
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8 | |
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8 | |
%101 = shl i32 %.098108, 8 | |
%102 = or i32 %101, 1792 | |
%103 = add i32 %46, %102 | |
%104 = sext i32 %103 to i64 | |
%105 = getelementptr inbounds float, float* %47, i64 %104 | |
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8 | |
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8 | |
%108 = shl i32 %.098108, 8 | |
%109 = or i32 %108, 2048 | |
%110 = add i32 %46, %109 | |
%111 = sext i32 %110 to i64 | |
%112 = getelementptr inbounds float, float* %47, i64 %111 | |
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8 | |
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8 | |
%115 = shl i32 %.098108, 8 | |
%116 = or i32 %115, 2304 | |
%117 = add i32 %46, %116 | |
%118 = sext i32 %117 to i64 | |
%119 = getelementptr inbounds float, float* %47, i64 %118 | |
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8 | |
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8 | |
%122 = shl i32 %.098108, 8 | |
%123 = or i32 %122, 2560 | |
%124 = add i32 %46, %123 | |
%125 = sext i32 %124 to i64 | |
%126 = getelementptr inbounds float, float* %47, i64 %125 | |
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8 | |
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8 | |
%129 = shl i32 %.098108, 8 | |
%130 = or i32 %129, 2816 | |
%131 = add i32 %46, %130 | |
%132 = sext i32 %131 to i64 | |
%133 = getelementptr inbounds float, float* %47, i64 %132 | |
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8 | |
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8 | |
%136 = shl i32 %.098108, 8 | |
%137 = or i32 %136, 3072 | |
%138 = add i32 %46, %137 | |
%139 = sext i32 %138 to i64 | |
%140 = getelementptr inbounds float, float* %47, i64 %139 | |
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8 | |
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8 | |
%143 = shl i32 %.098108, 8 | |
%144 = or i32 %143, 3328 | |
%145 = add i32 %46, %144 | |
%146 = sext i32 %145 to i64 | |
%147 = getelementptr inbounds float, float* %47, i64 %146 | |
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8 | |
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8 | |
%150 = shl i32 %.098108, 8 | |
%151 = or i32 %150, 3584 | |
%152 = add i32 %46, %151 | |
%153 = sext i32 %152 to i64 | |
%154 = getelementptr inbounds float, float* %47, i64 %153 | |
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8 | |
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8 | |
%157 = shl i32 %.098108, 8 | |
%158 = or i32 %157, 3840 | |
%159 = add i32 %46, %158 | |
%160 = sext i32 %159 to i64 | |
%161 = getelementptr inbounds float, float* %47, i64 %160 | |
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8 | |
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8 | |
%164 = icmp slt i32 %49, 128 | |
br i1 %164, label %48, label %.thread.preheader.loopexit | |
.preheader101: ; preds = %48 | |
%.lcssa = phi i32 [ %50, %48 ] | |
%.098108.lcssa = phi i32 [ %.098108, %48 ] | |
%.095109.lcssa = phi float [ %.095109, %48 ] | |
%165 = load float*, float** %38, align 8 | |
%166 = add nsw i32 %.lcssa, %43 | |
%167 = icmp slt i32 %166, %3 | |
br i1 %167, label %168, label %.thread.preheader | |
; <label>:168: ; preds = %.preheader101 | |
%169 = add nsw i32 %166, %45 | |
%170 = sext i32 %169 to i64 | |
%171 = getelementptr inbounds float, float* %165, i64 %170 | |
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8 | |
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8 | |
%174 = shl i32 %.098108.lcssa, 8 | |
%175 = or i32 %174, 256 | |
%176 = add nsw i32 %175, %43 | |
%177 = icmp slt i32 %176, %3 | |
br i1 %177, label %198, label %.thread.preheader | |
; <label>:178: ; preds = %.thread | |
%.lcssa138 = phi float [ %180, %.thread ] | |
%.not99 = xor i1 %44, true | |
%brmerge = or i1 %.not, %.not99 | |
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183 | |
.thread: ; preds = %.thread.preheader, %.thread | |
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ] | |
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ] | |
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53 | |
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8 | |
%181 = lshr i32 %.092113, 1 | |
%182 = icmp eq i32 %181, 0 | |
br i1 %182, label %178, label %.thread, !llvm.loop !63 | |
; <label>:183: ; preds = %178 | |
%184 = sext i32 %41 to i64 | |
%185 = load float*, float** %37, align 8 | |
%186 = getelementptr inbounds float, float* %185, i64 %184 | |
%187 = bitcast float %.lcssa138 to i32 | |
%188 = bitcast float* %186 to i32* | |
%189 = load i32, i32* %188, align 4 | |
br label %190 | |
; <label>:190: ; preds = %193, %183 | |
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ] | |
%191 = bitcast i32 %.011.i to float | |
%192 = fcmp olt float %191, %.lcssa138 | |
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit | |
; <label>:193: ; preds = %190 | |
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst | |
%195 = extractvalue { i32, i1 } %194, 0 | |
%not..i = icmp eq i32 %.011.i, %195 | |
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193 | |
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178 | |
%196 = add nuw nsw i32 %.0114, 32 | |
%197 = icmp slt i32 %196, %32 | |
br i1 %197, label %39, label %._crit_edge.loopexit | |
; <label>:198: ; preds = %168 | |
%199 = add nsw i32 %176, %45 | |
%200 = sext i32 %199 to i64 | |
%201 = getelementptr inbounds float, float* %165, i64 %200 | |
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8 | |
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8 | |
%204 = shl i32 %.098108.lcssa, 8 | |
%205 = or i32 %204, 512 | |
%206 = add nsw i32 %205, %43 | |
%207 = icmp slt i32 %206, %3 | |
br i1 %207, label %208, label %.thread.preheader | |
; <label>:208: ; preds = %198 | |
%209 = add nsw i32 %206, %45 | |
%210 = sext i32 %209 to i64 | |
%211 = getelementptr inbounds float, float* %165, i64 %210 | |
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8 | |
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8 | |
%214 = shl i32 %.098108.lcssa, 8 | |
%215 = or i32 %214, 768 | |
%216 = add nsw i32 %215, %43 | |
%217 = icmp slt i32 %216, %3 | |
br i1 %217, label %218, label %.thread.preheader | |
; <label>:218: ; preds = %208 | |
%219 = add nsw i32 %216, %45 | |
%220 = sext i32 %219 to i64 | |
%221 = getelementptr inbounds float, float* %165, i64 %220 | |
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8 | |
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8 | |
%224 = shl i32 %.098108.lcssa, 8 | |
%225 = or i32 %224, 1024 | |
%226 = add nsw i32 %225, %43 | |
%227 = icmp slt i32 %226, %3 | |
br i1 %227, label %228, label %.thread.preheader | |
; <label>:228: ; preds = %218 | |
%229 = add nsw i32 %226, %45 | |
%230 = sext i32 %229 to i64 | |
%231 = getelementptr inbounds float, float* %165, i64 %230 | |
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8 | |
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8 | |
%234 = shl i32 %.098108.lcssa, 8 | |
%235 = or i32 %234, 1280 | |
%236 = add nsw i32 %235, %43 | |
%237 = icmp slt i32 %236, %3 | |
br i1 %237, label %238, label %.thread.preheader | |
; <label>:238: ; preds = %228 | |
%239 = add nsw i32 %236, %45 | |
%240 = sext i32 %239 to i64 | |
%241 = getelementptr inbounds float, float* %165, i64 %240 | |
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8 | |
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8 | |
%244 = shl i32 %.098108.lcssa, 8 | |
%245 = or i32 %244, 1536 | |
%246 = add nsw i32 %245, %43 | |
%247 = icmp slt i32 %246, %3 | |
br i1 %247, label %248, label %.thread.preheader | |
; <label>:248: ; preds = %238 | |
%249 = add nsw i32 %246, %45 | |
%250 = sext i32 %249 to i64 | |
%251 = getelementptr inbounds float, float* %165, i64 %250 | |
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8 | |
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8 | |
%254 = shl i32 %.098108.lcssa, 8 | |
%255 = or i32 %254, 1792 | |
%256 = add nsw i32 %255, %43 | |
%257 = icmp slt i32 %256, %3 | |
br i1 %257, label %258, label %.thread.preheader | |
; <label>:258: ; preds = %248 | |
%259 = add nsw i32 %256, %45 | |
%260 = sext i32 %259 to i64 | |
%261 = getelementptr inbounds float, float* %165, i64 %260 | |
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8 | |
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8 | |
%264 = shl i32 %.098108.lcssa, 8 | |
%265 = or i32 %264, 2048 | |
%266 = add nsw i32 %265, %43 | |
%267 = icmp slt i32 %266, %3 | |
br i1 %267, label %268, label %.thread.preheader | |
; <label>:268: ; preds = %258 | |
%269 = add nsw i32 %266, %45 | |
%270 = sext i32 %269 to i64 | |
%271 = getelementptr inbounds float, float* %165, i64 %270 | |
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8 | |
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8 | |
%274 = shl i32 %.098108.lcssa, 8 | |
%275 = or i32 %274, 2304 | |
%276 = add nsw i32 %275, %43 | |
%277 = icmp slt i32 %276, %3 | |
br i1 %277, label %278, label %.thread.preheader | |
; <label>:278: ; preds = %268 | |
%279 = add nsw i32 %276, %45 | |
%280 = sext i32 %279 to i64 | |
%281 = getelementptr inbounds float, float* %165, i64 %280 | |
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8 | |
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8 | |
%284 = shl i32 %.098108.lcssa, 8 | |
%285 = or i32 %284, 2560 | |
%286 = add nsw i32 %285, %43 | |
%287 = icmp slt i32 %286, %3 | |
br i1 %287, label %288, label %.thread.preheader | |
; <label>:288: ; preds = %278 | |
%289 = add nsw i32 %286, %45 | |
%290 = sext i32 %289 to i64 | |
%291 = getelementptr inbounds float, float* %165, i64 %290 | |
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8 | |
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8 | |
%294 = shl i32 %.098108.lcssa, 8 | |
%295 = or i32 %294, 2816 | |
%296 = add nsw i32 %295, %43 | |
%297 = icmp slt i32 %296, %3 | |
br i1 %297, label %298, label %.thread.preheader | |
; <label>:298: ; preds = %288 | |
%299 = add nsw i32 %296, %45 | |
%300 = sext i32 %299 to i64 | |
%301 = getelementptr inbounds float, float* %165, i64 %300 | |
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8 | |
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8 | |
%304 = shl i32 %.098108.lcssa, 8 | |
%305 = or i32 %304, 3072 | |
%306 = add nsw i32 %305, %43 | |
%307 = icmp slt i32 %306, %3 | |
br i1 %307, label %308, label %.thread.preheader | |
; <label>:308: ; preds = %298 | |
%309 = add nsw i32 %306, %45 | |
%310 = sext i32 %309 to i64 | |
%311 = getelementptr inbounds float, float* %165, i64 %310 | |
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8 | |
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8 | |
%314 = shl i32 %.098108.lcssa, 8 | |
%315 = or i32 %314, 3328 | |
%316 = add nsw i32 %315, %43 | |
%317 = icmp slt i32 %316, %3 | |
br i1 %317, label %318, label %.thread.preheader | |
; <label>:318: ; preds = %308 | |
%319 = add nsw i32 %316, %45 | |
%320 = sext i32 %319 to i64 | |
%321 = getelementptr inbounds float, float* %165, i64 %320 | |
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8 | |
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8 | |
%324 = shl i32 %.098108.lcssa, 8 | |
%325 = or i32 %324, 3584 | |
%326 = add nsw i32 %325, %43 | |
%327 = icmp slt i32 %326, %3 | |
br i1 %327, label %328, label %.thread.preheader | |
; <label>:328: ; preds = %318 | |
%329 = add nsw i32 %326, %45 | |
%330 = sext i32 %329 to i64 | |
%331 = getelementptr inbounds float, float* %165, i64 %330 | |
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8 | |
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8 | |
br label %.thread.preheader | |
} | |
; Function Attrs: convergent nounwind | |
; ============================================================================
; NOTE(review): this is compiler-generated NVPTX IR (NVVM output for Eigen's
; cxx11_tensor_reduction_cuda test), not hand-written source.  Comments below
; annotate the generated code; do not hand-edit the instructions themselves.
;
; Eigen::internal::ColumnReduceKernel<BlockDimX=128? (grid), 256, 16, ...,
; CudaSumReducer> — column-wise *sum* reduction.
;   %0 = CudaSumReducer (byval, empty struct)
;   %1 = input evaluator  (field 0 holds the input  float* — loaded at %40)
;   %2 = extent of the reduced dimension ("rows" — compared against at %55)
;   %3 = number of reduction outputs   ("columns" — output indexed by i % %3)
; (row/column interpretation inferred from the indexing math below — confirm
;  against Eigen's TensorReductionCuda.h.)
; Each work item i covers a 16-row strip of one column; partial sums are
; combined into the output with atomic float adds.
; ============================================================================
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat { | |
; --- launch-configuration asserts: blockDim must be (256,1,1), gridDim (128,1,1) ---
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 128 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; --- work partitioning ---
; %31 = ceil(%2/16)  (add 15 then sdiv 16): number of 16-row strips per column.
; %32 = %31 * %3: total number of work items (strip, column) pairs.
; %36 = blockIdx.x*256 + threadIdx.x: this thread's first work-item index
;       (shl 8 == *256, matching the asserted blockDim.x above).
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %2, 15 | |
%31 = sdiv i32 %30, 16 | |
%32 = mul nsw i32 %31, %3 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = shl nuw nsw i32 %33, 8 | |
%36 = add nuw nsw i32 %35, %34 | |
%37 = icmp slt i32 %36, %32 | |
br i1 %37, label %.lr.ph, label %._crit_edge | |
; %38 splits on the sign of %3: the ".us" path below is the normal case
; (%3 >= 0); the ".split" path is the compiler-specialized %3 < 0 variant.
.lr.ph: ; preds = %29 | |
%38 = icmp sgt i32 %3, -1 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0 | |
%40 = load float*, float** %39, align 8 | |
%41 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0 | |
%42 = load float*, float** %41, align 8 | |
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader | |
; Degenerate (%3 < 0) path: the inner accumulation folded away, leaving only
; atomic adds of 0.0 to the output; trip count = ((%32-1-%36) >> 15) + 1,
; unrolled 4x with %xtraiter leftover iterations handled by the .epil loop.
.lr.ph.split.preheader: ; preds = %.lr.ph | |
%43 = add i32 %32, -1 | |
%44 = sub i32 %43, %34 | |
%45 = sub i32 %44, %35 | |
%46 = lshr i32 %45, 15 | |
%47 = add nuw nsw i32 %46, 1 | |
%xtraiter = and i32 %47, 3 | |
%48 = icmp ult i32 %45, 98304 | |
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new | |
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader | |
%unroll_iter = sub nsw i32 %47, %xtraiter | |
br label %.lr.ph.split | |
.lr.ph.split.us.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split.us | |
; Main outer loop over work items i (%.047.us), stride 32768 = asserted
; gridDim.x(128) * blockDim.x(256).
;   %49 = i % %3   — output column
;   %52 = (i / %3 % %31) * 16 — first row of this item's 16-row strip
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us | |
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ] | |
%49 = srem i32 %.047.us, %3 | |
%50 = sdiv i32 %.047.us, %3 | |
%51 = srem i32 %50, %31 | |
%52 = shl nsw i32 %51, 4 | |
br label %53 | |
; Inner loop: 16 rows per strip, unrolled 2x (counter %.04346.us.us steps by 2
; until it hits 16).  First of the two unrolled halves; rows past %2 contribute
; the sum identity 0.0.  Loads go through ldg (read-only texture-path load).
; <label>:53: ; preds = %104, %.lr.ph.split.us | |
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ] | |
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ] | |
%54 = add nuw nsw i32 %.04346.us.us, %52 | |
%55 = icmp slt i32 %54, %2 | |
br i1 %55, label %56, label %62 | |
; <label>:56: ; preds = %53 | |
%57 = mul nsw i32 %54, %3 | |
%58 = add nsw i32 %57, %49 | |
%59 = sext i32 %58 to i64 | |
%60 = getelementptr inbounds float, float* %40, i64 %59 | |
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8 | |
br label %62 | |
; <label>:62: ; preds = %56, %53 | |
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ] | |
%64 = fadd float %.04445.us.us, %63 | |
%65 = or i32 %.04346.us.us, 1 | |
%66 = add nuw nsw i32 %65, %52 | |
%67 = icmp slt i32 %66, %2 | |
br i1 %67, label %98, label %104 | |
; Strip finished: atomically add the partial sum into output[%49], then
; advance to the next work item (stride 32768).
.us-lcssa.us.us: ; preds = %104 | |
%.lcssa = phi float [ %106, %104 ] | |
%68 = sext i32 %49 to i64 | |
%69 = getelementptr inbounds float, float* %42, i64 %68 | |
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8 | |
%71 = add nuw nsw i32 %.047.us, 32768 | |
%72 = icmp slt i32 %71, %32 | |
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit | |
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us | |
br label %._crit_edge | |
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split | |
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ] | |
br label %._crit_edge.loopexit59.unr-lcssa | |
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader | |
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ] | |
%lcmp.mod = icmp eq i32 %xtraiter, 0 | |
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader | |
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa | |
br label %.lr.ph.split.epil | |
; Unroll epilogue for the degenerate path: %xtraiter leftover atomic adds of 0.0.
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader | |
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ] | |
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ] | |
%73 = srem i32 %.047.epil, %3 | |
%74 = sext i32 %73 to i64 | |
%75 = getelementptr inbounds float, float* %42, i64 %74 | |
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8 | |
%77 = add nuw nsw i32 %.047.epil, 32768 | |
%epil.iter.sub = add i32 %epil.iter, -1 | |
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 | |
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !64 | |
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil | |
br label %._crit_edge.loopexit59 | |
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29 | |
ret void | |
; Degenerate-path main loop, unrolled 4x (work-item offsets +0/+32768/+65536/
; +98304 per iteration, advancing by 131072).
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new | |
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ] | |
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ] | |
%78 = srem i32 %.047, %3 | |
%79 = sext i32 %78 to i64 | |
%80 = getelementptr inbounds float, float* %42, i64 %79 | |
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8 | |
%82 = add nuw nsw i32 %.047, 32768 | |
%83 = srem i32 %82, %3 | |
%84 = sext i32 %83 to i64 | |
%85 = getelementptr inbounds float, float* %42, i64 %84 | |
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8 | |
%87 = add nsw i32 %.047, 65536 | |
%88 = srem i32 %87, %3 | |
%89 = sext i32 %88 to i64 | |
%90 = getelementptr inbounds float, float* %42, i64 %89 | |
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8 | |
%92 = add nsw i32 %.047, 98304 | |
%93 = srem i32 %92, %3 | |
%94 = sext i32 %93 to i64 | |
%95 = getelementptr inbounds float, float* %42, i64 %94 | |
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8 | |
%97 = add nsw i32 %.047, 131072 | |
%niter.nsub.3 = add i32 %niter, -4 | |
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 | |
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split | |
; Second half of the 2x-unrolled inner loop (row index %66 = strip base | 1).
; <label>:98: ; preds = %62 | |
%99 = mul nsw i32 %66, %3 | |
%100 = add nsw i32 %99, %49 | |
%101 = sext i32 %100 to i64 | |
%102 = getelementptr inbounds float, float* %40, i64 %101 | |
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8 | |
br label %104 | |
; <label>:104: ; preds = %98, %62 | |
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ] | |
%106 = fadd float %64, %105 | |
%107 = add nsw i32 %.04346.us.us, 2 | |
%exitcond.1 = icmp eq i32 %107, 16 | |
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53 | |
} | |
; Function Attrs: convergent nounwind | |
; ============================================================================
; NOTE(review): compiler-generated NVPTX IR — annotate only, do not hand-edit.
;
; ColumnReduceKernel<128,256,16,...,CudaMaxReducer> — column-wise *max*
; reduction.  Same structure as the CudaSumReducer kernel above, with three
; differences visible in the IR:
;   * the accumulator starts from a float stored in the byval CudaMaxReducer
;     struct (loaded through %.idx45) rather than from 0.0;
;   * combining uses @llvm.nvvm.fmax.f instead of fadd;
;   * there is no float atomic-max instruction, so the result is merged into
;     the output with a compare-exchange loop on the bit pattern (bitcast
;     float<->i32 + cmpxchg), retrying while output < candidate.
; ============================================================================
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.3"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.7"* byval align 8) #2 comdat { | |
; --- launch-configuration asserts: blockDim (256,1,1), gridDim (128,1,1) ---
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 128 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; --- work partitioning: identical to the sum kernel (%31 strips of 16 rows,
; %32 total work items, %36 = blockIdx.x*256 + threadIdx.x) ---
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %2, 15 | |
%31 = sdiv i32 %30, 16 | |
%32 = mul nsw i32 %31, %3 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = shl nuw nsw i32 %33, 8 | |
%36 = add nuw nsw i32 %35, %34 | |
%37 = icmp slt i32 %36, %32 | |
br i1 %37, label %.lr.ph, label %._crit_edge | |
; %.idx45 points at the float stored in the byval reducer — the reduction's
; initial value.  %38 splits on the sign of %3 as in the sum kernel.
.lr.ph: ; preds = %29 | |
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0 | |
%38 = icmp sgt i32 %3, -1 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %1, i64 0, i32 0 | |
%40 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i64 0, i32 0 | |
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader | |
.lr.ph.split.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split | |
.lr.ph.split.us.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split.us | |
; Main outer loop over work items (stride 32768 = 128*256 per the asserts).
; %41 = output column, %44 = first row of the 16-row strip.
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ] | |
%41 = srem i32 %.048.us, %3 | |
%42 = sdiv i32 %.048.us, %3 | |
%43 = srem i32 %42, %31 | |
%44 = shl nsw i32 %43, 4 | |
%.idx45.val.us = load float, float* %.idx45, align 4 | |
%45 = load float*, float** %39, align 8 | |
br label %54 | |
; Inlined CudaMaxReducer::atomic_reduce retry loop: reload (via cmpxchg's
; returned old value %51), and keep trying while output < our strip max.
; <label>:46: ; preds = %49, %.us-lcssa.us.us | |
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ] | |
%47 = bitcast i32 %.011.i.us to float | |
%48 = fcmp olt float %47, %.lcssa | |
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
; <label>:49: ; preds = %46 | |
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst | |
%51 = extractvalue { i32, i1 } %50, 0 | |
%not..i.us = icmp eq i32 %.011.i.us, %51 | |
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46 | |
%52 = add nuw nsw i32 %.048.us, 32768 | |
%53 = icmp slt i32 %52, %32 | |
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit | |
; Inner loop: 16 rows, unrolled 2x; accumulator seeded with the reducer's
; initial value; out-of-range rows contribute that same initial value
; (the identity for max).
; <label>:54: ; preds = %112, %.lr.ph.split.us | |
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ] | |
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ] | |
%55 = add nuw nsw i32 %.04347.us.us, %44 | |
%56 = icmp slt i32 %55, %2 | |
br i1 %56, label %57, label %63 | |
; <label>:57: ; preds = %54 | |
%58 = mul nsw i32 %55, %3 | |
%59 = add nsw i32 %58, %41 | |
%60 = sext i32 %59 to i64 | |
%61 = getelementptr inbounds float, float* %45, i64 %60 | |
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8 | |
br label %63 | |
; <label>:63: ; preds = %54, %57 | |
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ] | |
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8 | |
%66 = or i32 %.04347.us.us, 1 | |
%67 = add nuw nsw i32 %66, %44 | |
%68 = icmp slt i32 %67, %2 | |
br i1 %68, label %106, label %112 | |
; Strip finished: set up the CAS atomic-max into output[%41] — bitcast the
; candidate (%72) and the output slot (%73) to i32, load the current value,
; then enter the retry loop at %46.
.us-lcssa.us.us: ; preds = %112 | |
%.lcssa = phi float [ %114, %112 ] | |
%69 = sext i32 %41 to i64 | |
%70 = load float*, float** %40, align 8 | |
%71 = getelementptr inbounds float, float* %70, i64 %69 | |
%72 = bitcast float %.lcssa to i32 | |
%73 = bitcast float* %71 to i32* | |
%74 = load i32, i32* %73, align 4 | |
br label %46 | |
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
br label %._crit_edge | |
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29 | |
ret void | |
; Degenerate (%3 < 0) path: all loads folded away, so the 16-iteration inner
; loop collapsed to a chain of fmax(init, init) (%75..%90), followed by the
; same CAS atomic-max into output[%.048 % %3].
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ] | |
%.idx45.val = load float, float* %.idx45, align 4 | |
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8 | |
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8 | |
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8 | |
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8 | |
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8 | |
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8 | |
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8 | |
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8 | |
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8 | |
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8 | |
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8 | |
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8 | |
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8 | |
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8 | |
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8 | |
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8 | |
%91 = srem i32 %.048, %3 | |
%92 = sext i32 %91 to i64 | |
%93 = load float*, float** %40, align 8 | |
%94 = getelementptr inbounds float, float* %93, i64 %92 | |
%95 = bitcast float %90 to i32 | |
%96 = bitcast float* %94 to i32* | |
%97 = load i32, i32* %96, align 4 | |
br label %98 | |
; CAS retry loop for the degenerate path (same pattern as %46/%49 above).
; <label>:98: ; preds = %101, %.lr.ph.split | |
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ] | |
%99 = bitcast i32 %.011.i to float | |
%100 = fcmp olt float %99, %90 | |
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
; <label>:101: ; preds = %98 | |
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst | |
%103 = extractvalue { i32, i1 } %102, 0 | |
%not..i = icmp eq i32 %.011.i, %103 | |
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101 | |
%104 = add nuw nsw i32 %.048, 32768 | |
%105 = icmp slt i32 %104, %32 | |
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60 | |
; Second half of the 2x-unrolled inner loop (row index %67 = strip base | 1).
; <label>:106: ; preds = %63 | |
%107 = mul nsw i32 %67, %3 | |
%108 = add nsw i32 %107, %41 | |
%109 = sext i32 %108 to i64 | |
%110 = getelementptr inbounds float, float* %45, i64 %109 | |
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8 | |
br label %112 | |
; <label>:112: ; preds = %106, %63 | |
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ] | |
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8 | |
%115 = add nsw i32 %.04347.us.us, 2 | |
%exitcond.1 = icmp eq i32 %115, 16 | |
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54 | |
} | |
; Function Attrs: convergent nounwind | |
; ============================================================================
; NOTE(review): compiler-generated NVPTX IR — annotate only, do not hand-edit.
;
; Eigen::internal::FullReductionKernel<256,128,...,SumReducer<float>> —
; reduce the whole input to a single float at %3.
;   %0 = SumReducer (byval, empty), %1 = input evaluator (input float* at
;   field index 10,0 — loaded at %26), %2 = total element count,
;   %3 = output pointer.
; Each block owns a contiguous segment of 32768 elements (blockIdx.x << 15);
; each thread strides through that segment by 256 (the template blockDim),
; then the per-thread partial is reduced across the warp with shfl.down and
; lane 0 of each warp atomically adds into *%3.
; ============================================================================
define weak_odr void @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, float*) #2 comdat { | |
; %8 = blockIdx.x*32768 + threadIdx.x: this thread's first element.
%5 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%6 = shl nuw nsw i32 %5, 15 | |
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%8 = or i32 %6, %7 | |
%9 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%10 = icmp eq i32 %9, 1 | |
br i1 %10, label %11, label %15 | |
; Single-block launch only: thread 0 zero-initializes the output, then all
; threads sync before accumulating.
; <label>:11: ; preds = %4 | |
%12 = icmp eq i32 %8, 0 | |
br i1 %12, label %13, label %14 | |
; <label>:13: ; preds = %11 | |
store float 0.000000e+00, float* %3, align 4 | |
br label %14 | |
; <label>:14: ; preds = %13, %11 | |
tail call void @llvm.cuda.syncthreads() | |
br label %15 | |
; %..i = min(remaining elements from %8, 32768): this thread's workload cap.
; <label>:15: ; preds = %14, %4 | |
%16 = sub nsw i32 %2, %8 | |
%17 = icmp sgt i32 %16, 32768 | |
%..i = select i1 %17, i32 32768, i32 %16 | |
%18 = icmp sgt i32 %16, 0 | |
br i1 %18, label %.lr.ph, label %.preheader.preheader | |
.preheader.preheader.loopexit: ; preds = %.epil.preheader | |
%.lcssa47 = phi float [ %23, %.epil.preheader ] | |
br label %.preheader.preheader | |
; %.132.ph = this thread's partial sum entering the warp-shuffle reduction
; (0.0 when the thread had no elements).
.preheader.preheader: ; preds = %.preheader.preheader.loopexit, %.preheader.preheader.loopexit.unr-lcssa, %15 | |
%.132.ph = phi float [ 0.000000e+00, %15 ], [ %.lcssa36.ph, %.preheader.preheader.loopexit.unr-lcssa ], [ %.lcssa47, %.preheader.preheader.loopexit ] | |
br label %.preheader | |
.preheader.preheader.loopexit.unr-lcssa.loopexit: ; preds = %32 | |
%.lcssa49 = phi i32 [ %80, %32 ] | |
%.lcssa48 = phi float [ %79, %32 ] | |
br label %.preheader.preheader.loopexit.unr-lcssa | |
.preheader.preheader.loopexit.unr-lcssa: ; preds = %.preheader.preheader.loopexit.unr-lcssa.loopexit, %.lr.ph | |
%.lcssa36.ph = phi float [ undef, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ] | |
%.02535.unr = phi i32 [ 0, %.lr.ph ], [ %.lcssa49, %.preheader.preheader.loopexit.unr-lcssa.loopexit ] | |
%.03134.unr = phi float [ 0.000000e+00, %.lr.ph ], [ %.lcssa48, %.preheader.preheader.loopexit.unr-lcssa.loopexit ] | |
%lcmp.mod = icmp eq i32 %xtraiter, 0 | |
br i1 %lcmp.mod, label %.preheader.preheader, label %.epil.preheader.preheader | |
.epil.preheader.preheader: ; preds = %.preheader.preheader.loopexit.unr-lcssa | |
br label %.epil.preheader | |
; Unroll epilogue of the accumulation loop: up to 7 leftover stride-256 loads.
.epil.preheader: ; preds = %.epil.preheader.preheader, %.epil.preheader | |
%.02535.epil = phi i32 [ %24, %.epil.preheader ], [ %.02535.unr, %.epil.preheader.preheader ] | |
%.03134.epil = phi float [ %23, %.epil.preheader ], [ %.03134.unr, %.epil.preheader.preheader ] | |
%epil.iter = phi i32 [ %epil.iter.sub, %.epil.preheader ], [ %xtraiter, %.epil.preheader.preheader ] | |
%19 = add nuw nsw i32 %.02535.epil, %8 | |
%20 = sext i32 %19 to i64 | |
%21 = getelementptr inbounds float, float* %26, i64 %20 | |
%22 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %21, i32 4) #8 | |
%23 = fadd float %.03134.epil, %22 | |
%24 = add nuw nsw i32 %.02535.epil, 256 | |
%epil.iter.sub = add i32 %epil.iter, -1 | |
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 | |
br i1 %epil.iter.cmp, label %.preheader.preheader.loopexit, label %.epil.preheader, !llvm.loop !65 | |
; Accumulation-loop setup: trip count = ceil(max(%..i,256)/256), unrolled 8x;
; %xtraiter iterations go to the .epil loop above.
.lr.ph: ; preds = %15 | |
%25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0 | |
%26 = load float*, float** %25, align 8 | |
%27 = icmp sgt i32 %..i, 256 | |
%smax = select i1 %27, i32 %..i, i32 256 | |
%28 = add i32 %smax, -1 | |
%29 = lshr i32 %28, 8 | |
%30 = add nuw nsw i32 %29, 1 | |
%xtraiter = and i32 %30, 7 | |
%31 = icmp ult i32 %28, 1792 | |
br i1 %31, label %.preheader.preheader.loopexit.unr-lcssa, label %.lr.ph.new | |
.lr.ph.new: ; preds = %.lr.ph | |
%unroll_iter = sub nsw i32 %30, %xtraiter | |
br label %32 | |
; Main accumulation loop, unrolled 8x: ldg loads at offsets %8 + %.02535 +
; {0,256,...,1792}, summed into %.03134; counter advances by 2048.
; <label>:32: ; preds = %32, %.lr.ph.new | |
%.02535 = phi i32 [ 0, %.lr.ph.new ], [ %80, %32 ] | |
%.03134 = phi float [ 0.000000e+00, %.lr.ph.new ], [ %79, %32 ] | |
%niter = phi i32 [ %unroll_iter, %.lr.ph.new ], [ %niter.nsub.7, %32 ] | |
%33 = add nuw nsw i32 %.02535, %8 | |
%34 = sext i32 %33 to i64 | |
%35 = getelementptr inbounds float, float* %26, i64 %34 | |
%36 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %35, i32 4) #8 | |
%37 = fadd float %.03134, %36 | |
%38 = or i32 %.02535, 256 | |
%39 = add nuw nsw i32 %38, %8 | |
%40 = sext i32 %39 to i64 | |
%41 = getelementptr inbounds float, float* %26, i64 %40 | |
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8 | |
%43 = fadd float %37, %42 | |
%44 = or i32 %.02535, 512 | |
%45 = add nuw nsw i32 %44, %8 | |
%46 = sext i32 %45 to i64 | |
%47 = getelementptr inbounds float, float* %26, i64 %46 | |
%48 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %47, i32 4) #8 | |
%49 = fadd float %43, %48 | |
%50 = or i32 %.02535, 768 | |
%51 = add nuw nsw i32 %50, %8 | |
%52 = sext i32 %51 to i64 | |
%53 = getelementptr inbounds float, float* %26, i64 %52 | |
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8 | |
%55 = fadd float %49, %54 | |
%56 = or i32 %.02535, 1024 | |
%57 = add nuw nsw i32 %56, %8 | |
%58 = sext i32 %57 to i64 | |
%59 = getelementptr inbounds float, float* %26, i64 %58 | |
%60 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %59, i32 4) #8 | |
%61 = fadd float %55, %60 | |
%62 = or i32 %.02535, 1280 | |
%63 = add nuw nsw i32 %62, %8 | |
%64 = sext i32 %63 to i64 | |
%65 = getelementptr inbounds float, float* %26, i64 %64 | |
%66 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %65, i32 4) #8 | |
%67 = fadd float %61, %66 | |
%68 = or i32 %.02535, 1536 | |
%69 = add nuw nsw i32 %68, %8 | |
%70 = sext i32 %69 to i64 | |
%71 = getelementptr inbounds float, float* %26, i64 %70 | |
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8 | |
%73 = fadd float %67, %72 | |
%74 = or i32 %.02535, 1792 | |
%75 = add nuw nsw i32 %74, %8 | |
%76 = sext i32 %75 to i64 | |
%77 = getelementptr inbounds float, float* %26, i64 %76 | |
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8 | |
%79 = fadd float %73, %78 | |
%80 = add nsw i32 %.02535, 2048 | |
%niter.nsub.7 = add i32 %niter, -8 | |
%niter.ncmp.7 = icmp eq i32 %niter.nsub.7, 0 | |
br i1 %niter.ncmp.7, label %.preheader.preheader.loopexit.unr-lcssa.loopexit, label %32, !llvm.loop !66 | |
; After the warp reduction: only lane 0 (%7 & 31 == 0) publishes the result.
; <label>:81: ; preds = %.preheader | |
%.lcssa = phi float [ %85, %.preheader ] | |
%82 = and i32 %7, 31 | |
%83 = icmp eq i32 %82, 0 | |
br i1 %83, label %88, label %90 | |
; Warp-level tree reduction: shfl.down.b32 with offsets 16,8,4,2,1 (%.033
; halves each iteration), adding the shuffled-in partial each step.
.preheader: ; preds = %.preheader.preheader, %.preheader | |
%.033 = phi i32 [ %86, %.preheader ], [ 16, %.preheader.preheader ] | |
%.132 = phi float [ %85, %.preheader ], [ %.132.ph, %.preheader.preheader ] | |
%84 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.132, i32 %.033, i32 31) #3, !srcloc !53 | |
%85 = fadd float %.132, %84 | |
%86 = lshr i32 %.033, 1 | |
%87 = icmp eq i32 %86, 0 | |
br i1 %87, label %81, label %.preheader, !llvm.loop !67 | |
; Lane 0: atomically add this warp's sum into the single output slot.
; <label>:88: ; preds = %81 | |
%89 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %3, float %.lcssa) #8 | |
br label %90 | |
; <label>:90: ; preds = %88, %81 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = shl nuw nsw i32 %6, 7 | |
%8 = add i32 %2, -1 | |
%9 = add i32 %8, %7 | |
%10 = udiv i32 %9, %7 | |
%11 = mul nsw i32 %10, %3 | |
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%13 = mul nuw nsw i32 %12, %6 | |
%14 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%15 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%16 = icmp eq i32 %12, 1 | |
br i1 %16, label %22, label %.preheader94 | |
.preheader94.loopexit: ; preds = %.lr.ph109 | |
br label %.preheader94 | |
.preheader94: ; preds = %.preheader94.loopexit, %22, %5 | |
%17 = icmp slt i32 %14, %11 | |
br i1 %17, label %.lr.ph106, label %._crit_edge | |
.lr.ph106: ; preds = %.preheader94 | |
%18 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0 | |
%19 = load float*, float** %18, align 8 | |
%20 = and i32 %15, 31 | |
%21 = icmp eq i32 %20, 0 | |
br label %30 | |
; <label>:22: ; preds = %5 | |
%23 = mul nuw nsw i32 %14, %6 | |
%24 = add nuw nsw i32 %23, %15 | |
%25 = icmp slt i32 %24, %3 | |
br i1 %25, label %.lr.ph109.preheader, label %.preheader94 | |
.lr.ph109.preheader: ; preds = %22 | |
br label %.lr.ph109 | |
.lr.ph109: ; preds = %.lr.ph109.preheader, %.lr.ph109 | |
%.081107 = phi i32 [ %28, %.lr.ph109 ], [ %24, %.lr.ph109.preheader ] | |
%26 = sext i32 %.081107 to i64 | |
%27 = getelementptr inbounds float, float* %4, i64 %26 | |
store float 0.000000e+00, float* %27, align 4 | |
%28 = add nsw i32 %.081107, %13 | |
%29 = icmp slt i32 %28, %3 | |
br i1 %29, label %.lr.ph109, label %.preheader94.loopexit | |
._crit_edge.loopexit: ; preds = %177 | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %.preheader94 | |
ret void | |
; <label>:30: ; preds = %.lr.ph106, %177 | |
%.083105 = phi i32 [ %14, %.lr.ph106 ], [ %178, %177 ] | |
%31 = sdiv i32 %.083105, %10 | |
%32 = icmp slt i32 %31, %3 | |
br i1 %32, label %33, label %177 | |
; <label>:33: ; preds = %30 | |
%34 = srem i32 %.083105, %10 | |
%35 = mul i32 %7, %34 | |
%36 = add i32 %35, %15 | |
%37 = mul nsw i32 %31, %2 | |
%38 = add i32 %36, %37 | |
br label %39 | |
; <label>:39: ; preds = %33, %.preheader.preheader | |
%.086100 = phi i32 [ 0, %33 ], [ %40, %.preheader.preheader ] | |
%.09299 = phi float [ 0.000000e+00, %33 ], [ %155, %.preheader.preheader ] | |
%40 = add nuw nsw i32 %.086100, 16 | |
%41 = or i32 %.086100, 15 | |
%42 = mul i32 %41, %6 | |
%43 = add i32 %42, %36 | |
%44 = icmp slt i32 %43, %2 | |
%45 = mul i32 %.086100, %6 | |
br i1 %44, label %.preheader.preheader, label %157 | |
.preheader.preheader: ; preds = %39 | |
%46 = add i32 %38, %45 | |
%47 = sext i32 %46 to i64 | |
%48 = getelementptr inbounds float, float* %19, i64 %47 | |
%49 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %48, i32 4) #8 | |
%50 = fadd float %.09299, %49 | |
%51 = or i32 %.086100, 1 | |
%52 = mul i32 %51, %6 | |
%53 = add i32 %38, %52 | |
%54 = sext i32 %53 to i64 | |
%55 = getelementptr inbounds float, float* %19, i64 %54 | |
%56 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %55, i32 4) #8 | |
%57 = fadd float %50, %56 | |
%58 = or i32 %.086100, 2 | |
%59 = mul i32 %58, %6 | |
%60 = add i32 %38, %59 | |
%61 = sext i32 %60 to i64 | |
%62 = getelementptr inbounds float, float* %19, i64 %61 | |
%63 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %62, i32 4) #8 | |
%64 = fadd float %57, %63 | |
%65 = or i32 %.086100, 3 | |
%66 = mul i32 %65, %6 | |
%67 = add i32 %38, %66 | |
%68 = sext i32 %67 to i64 | |
%69 = getelementptr inbounds float, float* %19, i64 %68 | |
%70 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %69, i32 4) #8 | |
%71 = fadd float %64, %70 | |
%72 = or i32 %.086100, 4 | |
%73 = mul i32 %72, %6 | |
%74 = add i32 %38, %73 | |
%75 = sext i32 %74 to i64 | |
%76 = getelementptr inbounds float, float* %19, i64 %75 | |
%77 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %76, i32 4) #8 | |
%78 = fadd float %71, %77 | |
%79 = or i32 %.086100, 5 | |
%80 = mul i32 %79, %6 | |
%81 = add i32 %38, %80 | |
%82 = sext i32 %81 to i64 | |
%83 = getelementptr inbounds float, float* %19, i64 %82 | |
%84 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %83, i32 4) #8 | |
%85 = fadd float %78, %84 | |
%86 = or i32 %.086100, 6 | |
%87 = mul i32 %86, %6 | |
%88 = add i32 %38, %87 | |
%89 = sext i32 %88 to i64 | |
%90 = getelementptr inbounds float, float* %19, i64 %89 | |
%91 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %90, i32 4) #8 | |
%92 = fadd float %85, %91 | |
%93 = or i32 %.086100, 7 | |
%94 = mul i32 %93, %6 | |
%95 = add i32 %38, %94 | |
%96 = sext i32 %95 to i64 | |
%97 = getelementptr inbounds float, float* %19, i64 %96 | |
%98 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %97, i32 4) #8 | |
%99 = fadd float %92, %98 | |
%100 = or i32 %.086100, 8 | |
%101 = mul i32 %100, %6 | |
%102 = add i32 %38, %101 | |
%103 = sext i32 %102 to i64 | |
%104 = getelementptr inbounds float, float* %19, i64 %103 | |
%105 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %104, i32 4) #8 | |
%106 = fadd float %99, %105 | |
%107 = or i32 %.086100, 9 | |
%108 = mul i32 %107, %6 | |
%109 = add i32 %38, %108 | |
%110 = sext i32 %109 to i64 | |
%111 = getelementptr inbounds float, float* %19, i64 %110 | |
%112 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %111, i32 4) #8 | |
%113 = fadd float %106, %112 | |
%114 = or i32 %.086100, 10 | |
%115 = mul i32 %114, %6 | |
%116 = add i32 %38, %115 | |
%117 = sext i32 %116 to i64 | |
%118 = getelementptr inbounds float, float* %19, i64 %117 | |
%119 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %118, i32 4) #8 | |
%120 = fadd float %113, %119 | |
%121 = or i32 %.086100, 11 | |
%122 = mul i32 %121, %6 | |
%123 = add i32 %38, %122 | |
%124 = sext i32 %123 to i64 | |
%125 = getelementptr inbounds float, float* %19, i64 %124 | |
%126 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %125, i32 4) #8 | |
%127 = fadd float %120, %126 | |
%128 = or i32 %.086100, 12 | |
%129 = mul i32 %128, %6 | |
%130 = add i32 %38, %129 | |
%131 = sext i32 %130 to i64 | |
%132 = getelementptr inbounds float, float* %19, i64 %131 | |
%133 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %132, i32 4) #8 | |
%134 = fadd float %127, %133 | |
%135 = or i32 %.086100, 13 | |
%136 = mul i32 %135, %6 | |
%137 = add i32 %38, %136 | |
%138 = sext i32 %137 to i64 | |
%139 = getelementptr inbounds float, float* %19, i64 %138 | |
%140 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %139, i32 4) #8 | |
%141 = fadd float %134, %140 | |
%142 = or i32 %.086100, 14 | |
%143 = mul i32 %142, %6 | |
%144 = add i32 %38, %143 | |
%145 = sext i32 %144 to i64 | |
%146 = getelementptr inbounds float, float* %19, i64 %145 | |
%147 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %146, i32 4) #8 | |
%148 = fadd float %141, %147 | |
%149 = or i32 %.086100, 15 | |
%150 = mul i32 %149, %6 | |
%151 = add i32 %38, %150 | |
%152 = sext i32 %151 to i64 | |
%153 = getelementptr inbounds float, float* %19, i64 %152 | |
%154 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %153, i32 4) #8 | |
%155 = fadd float %148, %154 | |
%156 = icmp slt i32 %40, 128 | |
br i1 %156, label %39, label %.critedge.loopexit125 | |
; <label>:157: ; preds = %39 | |
%.lcssa = phi i32 [ %45, %39 ] | |
%.09299.lcssa = phi float [ %.09299, %39 ] | |
%158 = add i32 %.lcssa, %36 | |
%159 = icmp slt i32 %158, %2 | |
br i1 %159, label %.lr.ph.preheader, label %.critedge | |
.lr.ph.preheader: ; preds = %157 | |
br label %.lr.ph | |
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph | |
%.084102 = phi i32 [ %165, %.lr.ph ], [ %158, %.lr.ph.preheader ] | |
%.1101 = phi float [ %164, %.lr.ph ], [ %.09299.lcssa, %.lr.ph.preheader ] | |
%160 = add nsw i32 %.084102, %37 | |
%161 = sext i32 %160 to i64 | |
%162 = getelementptr inbounds float, float* %19, i64 %161 | |
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8 | |
%164 = fadd float %.1101, %163 | |
%165 = add i32 %.084102, %6 | |
%166 = icmp slt i32 %165, %2 | |
br i1 %166, label %.lr.ph, label %.critedge.loopexit | |
.critedge.loopexit: ; preds = %.lr.ph | |
%.lcssa134 = phi float [ %164, %.lr.ph ] | |
br label %.critedge | |
.critedge.loopexit125: ; preds = %.preheader.preheader | |
%.lcssa133 = phi float [ %155, %.preheader.preheader ] | |
br label %.critedge | |
.critedge: ; preds = %.critedge.loopexit125, %.critedge.loopexit, %157 | |
%.3 = phi float [ %.09299.lcssa, %157 ], [ %.lcssa134, %.critedge.loopexit ], [ %.lcssa133, %.critedge.loopexit125 ] | |
tail call void @llvm.cuda.syncthreads() | |
br label %168 | |
; <label>:167: ; preds = %168 | |
%.lcssa135 = phi float [ %170, %168 ] | |
br i1 %21, label %173, label %177 | |
; <label>:168: ; preds = %.critedge, %168 | |
%.0104 = phi i32 [ 16, %.critedge ], [ %171, %168 ] | |
%.4103 = phi float [ %.3, %.critedge ], [ %170, %168 ] | |
%169 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.4103, i32 %.0104, i32 31) #3, !srcloc !53 | |
%170 = fadd float %.4103, %169 | |
%171 = lshr i32 %.0104, 1 | |
%172 = icmp eq i32 %171, 0 | |
br i1 %172, label %167, label %168 | |
; <label>:173: ; preds = %167 | |
%174 = sext i32 %31 to i64 | |
%175 = getelementptr inbounds float, float* %4, i64 %174 | |
%176 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %175, float %.lcssa135) #8 | |
br label %177 | |
; <label>:177: ; preds = %167, %173, %30 | |
tail call void @llvm.cuda.syncthreads() | |
%178 = add i32 %.083105, %12 | |
%179 = icmp slt i32 %178, %11 | |
br i1 %179, label %30, label %._crit_edge.loopexit | |
} | |
; Function Attrs: convergent nounwind | |
; Eigen OuterReductionKernel<16, ...> instantiation: sums a 2-D float input
; over its outer dimension in tiles of 16 rows, accumulating into the float
; output buffer (arg %4) via atomic float adds.
; Args: %0 = SumReducer (empty, byval), %1 = TensorEvaluator (byval),
;       %2 / %3 = i32 extents (reduced and preserved sizes — names inferred
;       from usage, TODO confirm against Eigen source), %4 = float* output.
define weak_odr void @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE(%"struct.Eigen::internal::SumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.12"* byval align 8, i32, i32, float*) #2 comdat { | |
; Linear thread id %12 = ctaid.x * ntid.x + tid.x; total thread count
; %8 = nctaid.x * ntid.x (used as the zero-fill stride below).
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%8 = mul nuw nsw i32 %7, %6 | |
%9 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%10 = mul nuw nsw i32 %9, %6 | |
%11 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%12 = add nuw nsw i32 %10, %11 | |
; Only when the grid has a single block in x (nctaid.x == 1) does this kernel
; zero-initialize the output itself; otherwise it jumps straight to the
; accumulation loop at label 19 (presumably the caller pre-clears the output
; in the multi-block case — TODO confirm).
%13 = icmp eq i32 %7, 1 | |
br i1 %13, label %.preheader, label %19 | |
.preheader: ; preds = %5 | |
%14 = icmp slt i32 %12, %3 | |
br i1 %14, label %.lr.ph60.preheader, label %._crit_edge61 | |
.lr.ph60.preheader: ; preds = %.preheader | |
br label %.lr.ph60 | |
._crit_edge61.loopexit: ; preds = %.lr.ph60 | |
br label %._crit_edge61 | |
._crit_edge61: ; preds = %._crit_edge61.loopexit, %.preheader | |
; Barrier so all of output[0..%3) is zeroed before any block accumulates.
tail call void @llvm.cuda.syncthreads() | |
br label %19 | |
; Zero-fill loop: output[i] = 0.0f for i = %12, %12+%8, ... while i < %3.
.lr.ph60: ; preds = %.lr.ph60.preheader, %.lr.ph60 | |
%.059 = phi i32 [ %17, %.lr.ph60 ], [ %12, %.lr.ph60.preheader ] | |
%15 = sext i32 %.059 to i64 | |
%16 = getelementptr inbounds float, float* %4, i64 %15 | |
store float 0.000000e+00, float* %16, align 4 | |
%17 = add nsw i32 %.059, %8 | |
%18 = icmp slt i32 %17, %3 | |
br i1 %18, label %.lr.ph60, label %._crit_edge61.loopexit | |
; Work-item count %22 = ceil(%2 / 16) * %3: one item per (16-row tile,
; output column) pair. Threads grab items %12, %12+%8, ...
; <label>:19: ; preds = %._crit_edge61, %5 | |
%20 = add i32 %2, 15 | |
%21 = sdiv i32 %20, 16 | |
%22 = mul nsw i32 %21, %3 | |
%23 = icmp slt i32 %12, %22 | |
br i1 %23, label %.lr.ph57, label %._crit_edge58 | |
.lr.ph57: ; preds = %19 | |
; Load the input data pointer from evaluator field 10 (float* m_impl data).
%24 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.12", %"struct.Eigen::TensorEvaluator.12"* %1, i64 0, i32 10, i32 0 | |
%25 = load float*, float** %24, align 8 | |
br label %26 | |
._crit_edge58.loopexit: ; preds = %._crit_edge | |
br label %._crit_edge58 | |
._crit_edge58: ; preds = %._crit_edge58.loopexit, %19 | |
ret void | |
; Outer work loop over items %.04755: column %27 = item % %3, tile %28 =
; item / %3; the tile covers rows [%29, min(%29+16, %2)).
; <label>:26: ; preds = %.lr.ph57, %._crit_edge | |
%.04755 = phi i32 [ %12, %.lr.ph57 ], [ %36, %._crit_edge ] | |
%27 = srem i32 %.04755, %3 | |
%28 = sdiv i32 %.04755, %3 | |
%29 = shl nsw i32 %28, 4 | |
%30 = add nsw i32 %29, 16 | |
%31 = icmp sgt i32 %30, %2 | |
%..i = select i1 %31, i32 %2, i32 %30 | |
%32 = icmp slt i32 %29, %..i | |
br i1 %32, label %.lr.ph.preheader, label %._crit_edge | |
.lr.ph.preheader: ; preds = %26 | |
br label %.lr.ph | |
._crit_edge.loopexit: ; preds = %.lr.ph | |
%.lcssa = phi float [ %43, %.lr.ph ] | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %26 | |
; Atomically add this tile's partial sum (0.0 for an empty tile) into
; output[%27], then advance to the next work item.
%.052.lcssa = phi float [ 0.000000e+00, %26 ], [ %.lcssa, %._crit_edge.loopexit ] | |
%33 = sext i32 %27 to i64 | |
%34 = getelementptr inbounds float, float* %4, i64 %33 | |
%35 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %34, float %.052.lcssa) #8 | |
%36 = add nsw i32 %.04755, %8 | |
%37 = icmp slt i32 %36, %22 | |
br i1 %37, label %26, label %._crit_edge58.loopexit | |
; Inner loop: sum input[row * %3 + %27] (via read-only ldg loads) for
; row in [%29, %..i).
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph | |
%.04654 = phi i32 [ %44, %.lr.ph ], [ %29, %.lr.ph.preheader ] | |
%.05253 = phi float [ %43, %.lr.ph ], [ 0.000000e+00, %.lr.ph.preheader ] | |
%38 = mul nsw i32 %.04654, %3 | |
%39 = add nsw i32 %38, %27 | |
%40 = sext i32 %39 to i64 | |
%41 = getelementptr inbounds float, float* %25, i64 %40 | |
%42 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %41, i32 4) #8 | |
%43 = fadd float %.05253, %42 | |
%44 = add nsw i32 %.04654, 1 | |
%45 = icmp slt i32 %44, %..i | |
br i1 %45, label %.lr.ph, label %._crit_edge.loopexit | |
} | |
; Function Attrs: nounwind | |
; Eigen EigenMetaKernel for TensorEvalTo<TensorReductionOp<SumReducer<float>>>:
; each thread evaluates whole output coefficients %7, %7+%9, ... < %1, where
; evaluating one coefficient sums over an inner extent read from the
; evaluator. The inner loop is compiler-unrolled by 4 with a remainder
; ("epil") loop.
; Args: %0 = TensorEvaluator.14 (byval), %1 = i32 number of output coeffs.
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.14"* byval align 8, i32) #0 comdat { | |
; Linear thread id %7 and grid-wide thread count %9 (the outer stride).
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%5 = mul nuw nsw i32 %4, %3 | |
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%7 = add nuw nsw i32 %5, %6 | |
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%9 = mul nuw nsw i32 %8, %4 | |
; SROA-split copies out of the byval evaluator:
;   field [0][7] read as one i64 = two packed i32 strides (low half %11,
;   high half %14), field [0][9][0] = i32 inner reduction count,
;   field [0][10][0] = float* input data, field [2] = float* output buffer.
%.sroa.444.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 7 | |
%.sroa.444.0..sroa_cast = bitcast i32* %.sroa.444.0..sroa_idx to i64* | |
%.sroa.444.0.copyload = load i64, i64* %.sroa.444.0..sroa_cast, align 8 | |
%.sroa.546.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 9, i32 0, i64 0 | |
%.sroa.546.0.copyload = load i32, i32* %.sroa.546.0..sroa_idx, align 8 | |
%.sroa.750.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 0, i32 10, i32 0 | |
%.sroa.750.0.copyload = load float*, float** %.sroa.750.0..sroa_idx, align 8 | |
%.sroa.9.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.14", %"struct.Eigen::TensorEvaluator.14"* %0, i64 0, i32 2 | |
%.sroa.9.0.copyload = load float*, float** %.sroa.9.0..sroa_idx, align 8 | |
; Nothing to do when this thread's first index is already >= %1.
%10 = icmp slt i32 %7, %1 | |
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit | |
.lr.ph.i: ; preds = %2 | |
; %11/%14 = low/high i32 of the packed i64; %12 selects between the real
; reduction loop (count > 0) and the degenerate store-zeros loop.
%11 = trunc i64 %.sroa.444.0.copyload to i32 | |
%12 = icmp sgt i32 %.sroa.546.0.copyload, 0 | |
%13 = lshr i64 %.sroa.444.0.copyload, 32 | |
%14 = trunc i64 %13 to i32 | |
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader | |
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i | |
br label %.lr.ph.split.i | |
; Unroll bookkeeping for the inner sum: %xtraiter = count mod 4 (epilogue
; trips), %unroll_iter = count rounded down to a multiple of 4.
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i | |
%15 = add i32 %.sroa.546.0.copyload, -1 | |
%xtraiter = and i32 %.sroa.546.0.copyload, 3 | |
%16 = icmp ult i32 %15, 3 | |
%lcmp.mod = icmp eq i32 %xtraiter, 0 | |
%unroll_iter = sub i32 %.sroa.546.0.copyload, %xtraiter | |
br label %.lr.ph.split.us.i | |
; Outer loop over output coefficients %.07.us.i; %17 = coeff * %11 is the
; base offset into the input for this coefficient.
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i | |
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ] | |
%17 = mul nsw i32 %.07.us.i, %11 | |
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new | |
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i | |
br label %18 | |
; Inner sum, unrolled x4: acc += ldg(input[(i+k)*%14 + %17]) for k = 0..3.
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new | |
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ] | |
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ] | |
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ] | |
%20 = mul nsw i32 %.012.i.i.i.us.i, %14 | |
%21 = add nsw i32 %20, %17 | |
%22 = sext i32 %21 to i64 | |
%23 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %22 | |
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8 | |
%25 = fadd float %19, %24 | |
%26 = or i32 %.012.i.i.i.us.i, 1 | |
%27 = mul nsw i32 %26, %14 | |
%28 = add nsw i32 %27, %17 | |
%29 = sext i32 %28 to i64 | |
%30 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %29 | |
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8 | |
%32 = fadd float %25, %31 | |
%33 = or i32 %.012.i.i.i.us.i, 2 | |
%34 = mul nsw i32 %33, %14 | |
%35 = add nsw i32 %34, %17 | |
%36 = sext i32 %35 to i64 | |
%37 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %36 | |
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8 | |
%39 = fadd float %32, %38 | |
%40 = or i32 %.012.i.i.i.us.i, 3 | |
%41 = mul nsw i32 %40, %14 | |
%42 = add nsw i32 %41, %17 | |
%43 = sext i32 %42 to i64 | |
%44 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %43 | |
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8 | |
%46 = fadd float %39, %45 | |
%47 = add nsw i32 %.012.i.i.i.us.i, 4 | |
%niter.nsub.3 = add i32 %niter, -4 | |
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 | |
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18 | |
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18 | |
%.lcssa66 = phi i32 [ %47, %18 ] | |
%.lcssa65 = phi float [ %46, %18 ] | |
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa | |
; Merge point after the unrolled body: carry partial accumulator/index into
; the epilogue (or skip it when count was an exact multiple of 4).
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i | |
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ] | |
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa65, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ] | |
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ] | |
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader | |
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa | |
br label %48 | |
; Epilogue: the remaining (count mod 4) accumulation steps, one at a time.
; <label>:48: ; preds = %48, %.epil.preheader | |
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ] | |
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ] | |
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ] | |
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14 | |
%51 = add nsw i32 %50, %17 | |
%52 = sext i32 %51 to i64 | |
%53 = getelementptr inbounds float, float* %.sroa.750.0.copyload, i64 %52 | |
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8 | |
%55 = fadd float %49, %54 | |
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1 | |
%epil.iter.sub = add i32 %epil.iter, -1 | |
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 | |
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !68 | |
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48 | |
%.lcssa67 = phi float [ %55, %48 ] | |
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i | |
; evalScalar exit: store the finished sum to output[coeff] and advance the
; outer loop by the grid-wide thread count %9.
_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa | |
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ] | |
%57 = sext i32 %.07.us.i to i64 | |
%58 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %57 | |
store float %.lcssa, float* %58, align 4 | |
%59 = add nsw i32 %.07.us.i, %9 | |
%60 = icmp slt i32 %59, %1 | |
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit | |
; Degenerate path (reduction count == 0): every assigned output coefficient
; is the reducer identity 0.0f.
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i | |
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ] | |
%61 = sext i32 %.07.i to i64 | |
%62 = getelementptr inbounds float, float* %.sroa.9.0.copyload, i64 %61 | |
store float 0.000000e+00, float* %62, align 4 | |
%63 = add nsw i32 %.07.i, %9 | |
%64 = icmp slt i32 %63, %1 | |
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63 | |
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i | |
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit | |
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63: ; preds = %.lr.ph.split.i | |
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit | |
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit63, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESK_iii.exit.loopexit, %2 | |
ret void | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 32 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %3, 32767 | |
%31 = sdiv i32 %30, 32768 | |
%32 = mul nsw i32 %31, %2 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = icmp slt i32 %33, %32 | |
br i1 %35, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%36 = and i32 %34, 31 | |
%.not = icmp ne i32 %36, 0 | |
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0 | |
%38 = load float*, float** %37, align 8 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0 | |
%40 = load float*, float** %39, align 8 | |
br label %41 | |
._crit_edge.loopexit: ; preds = %187 | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %29 | |
ret void | |
; <label>:41: ; preds = %.lr.ph, %187 | |
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ] | |
%42 = srem i32 %.0114, %31 | |
%43 = sdiv i32 %.0114, %31 | |
%44 = shl nsw i32 %42, 15 | |
%45 = or i32 %44, %34 | |
%46 = icmp slt i32 %43, %2 | |
br i1 %46, label %.preheader102, label %.thread.preheader | |
.thread.preheader.loopexit: ; preds = %.preheader.preheader | |
%.lcssa137 = phi float [ %164, %.preheader.preheader ] | |
br label %.thread.preheader | |
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41 | |
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ] | |
br label %.thread | |
.preheader102: ; preds = %41 | |
%47 = mul nsw i32 %43, %3 | |
%48 = add i32 %47, %45 | |
br label %49 | |
; <label>:49: ; preds = %.preheader102, %.preheader.preheader | |
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ] | |
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ] | |
%50 = add nuw nsw i32 %.098108, 16 | |
%51 = shl i32 %.098108, 8 | |
%52 = or i32 %51, 3840 | |
%53 = add nsw i32 %52, %45 | |
%54 = icmp slt i32 %53, %3 | |
br i1 %54, label %.preheader.preheader, label %.preheader101 | |
.preheader.preheader: ; preds = %49 | |
%55 = add i32 %48, %51 | |
%56 = sext i32 %55 to i64 | |
%57 = getelementptr inbounds float, float* %40, i64 %56 | |
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8 | |
%59 = fadd float %.095109, %58 | |
%60 = shl i32 %.098108, 8 | |
%61 = or i32 %60, 256 | |
%62 = add i32 %48, %61 | |
%63 = sext i32 %62 to i64 | |
%64 = getelementptr inbounds float, float* %40, i64 %63 | |
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8 | |
%66 = fadd float %59, %65 | |
%67 = shl i32 %.098108, 8 | |
%68 = or i32 %67, 512 | |
%69 = add i32 %48, %68 | |
%70 = sext i32 %69 to i64 | |
%71 = getelementptr inbounds float, float* %40, i64 %70 | |
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8 | |
%73 = fadd float %66, %72 | |
%74 = shl i32 %.098108, 8 | |
%75 = or i32 %74, 768 | |
%76 = add i32 %48, %75 | |
%77 = sext i32 %76 to i64 | |
%78 = getelementptr inbounds float, float* %40, i64 %77 | |
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8 | |
%80 = fadd float %73, %79 | |
%81 = shl i32 %.098108, 8 | |
%82 = or i32 %81, 1024 | |
%83 = add i32 %48, %82 | |
%84 = sext i32 %83 to i64 | |
%85 = getelementptr inbounds float, float* %40, i64 %84 | |
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8 | |
%87 = fadd float %80, %86 | |
%88 = shl i32 %.098108, 8 | |
%89 = or i32 %88, 1280 | |
%90 = add i32 %48, %89 | |
%91 = sext i32 %90 to i64 | |
%92 = getelementptr inbounds float, float* %40, i64 %91 | |
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8 | |
%94 = fadd float %87, %93 | |
%95 = shl i32 %.098108, 8 | |
%96 = or i32 %95, 1536 | |
%97 = add i32 %48, %96 | |
%98 = sext i32 %97 to i64 | |
%99 = getelementptr inbounds float, float* %40, i64 %98 | |
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8 | |
%101 = fadd float %94, %100 | |
%102 = shl i32 %.098108, 8 | |
%103 = or i32 %102, 1792 | |
%104 = add i32 %48, %103 | |
%105 = sext i32 %104 to i64 | |
%106 = getelementptr inbounds float, float* %40, i64 %105 | |
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8 | |
%108 = fadd float %101, %107 | |
%109 = shl i32 %.098108, 8 | |
%110 = or i32 %109, 2048 | |
%111 = add i32 %48, %110 | |
%112 = sext i32 %111 to i64 | |
%113 = getelementptr inbounds float, float* %40, i64 %112 | |
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8 | |
%115 = fadd float %108, %114 | |
%116 = shl i32 %.098108, 8 | |
%117 = or i32 %116, 2304 | |
%118 = add i32 %48, %117 | |
%119 = sext i32 %118 to i64 | |
%120 = getelementptr inbounds float, float* %40, i64 %119 | |
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8 | |
%122 = fadd float %115, %121 | |
%123 = shl i32 %.098108, 8 | |
%124 = or i32 %123, 2560 | |
%125 = add i32 %48, %124 | |
%126 = sext i32 %125 to i64 | |
%127 = getelementptr inbounds float, float* %40, i64 %126 | |
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8 | |
%129 = fadd float %122, %128 | |
%130 = shl i32 %.098108, 8 | |
%131 = or i32 %130, 2816 | |
%132 = add i32 %48, %131 | |
%133 = sext i32 %132 to i64 | |
%134 = getelementptr inbounds float, float* %40, i64 %133 | |
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8 | |
%136 = fadd float %129, %135 | |
%137 = shl i32 %.098108, 8 | |
%138 = or i32 %137, 3072 | |
%139 = add i32 %48, %138 | |
%140 = sext i32 %139 to i64 | |
%141 = getelementptr inbounds float, float* %40, i64 %140 | |
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8 | |
%143 = fadd float %136, %142 | |
%144 = shl i32 %.098108, 8 | |
%145 = or i32 %144, 3328 | |
%146 = add i32 %48, %145 | |
%147 = sext i32 %146 to i64 | |
%148 = getelementptr inbounds float, float* %40, i64 %147 | |
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8 | |
%150 = fadd float %143, %149 | |
%151 = shl i32 %.098108, 8 | |
%152 = or i32 %151, 3584 | |
%153 = add i32 %48, %152 | |
%154 = sext i32 %153 to i64 | |
%155 = getelementptr inbounds float, float* %40, i64 %154 | |
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8 | |
%157 = fadd float %150, %156 | |
%158 = shl i32 %.098108, 8 | |
%159 = or i32 %158, 3840 | |
%160 = add i32 %48, %159 | |
%161 = sext i32 %160 to i64 | |
%162 = getelementptr inbounds float, float* %40, i64 %161 | |
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8 | |
%164 = fadd float %157, %163 | |
%165 = icmp slt i32 %50, 128 | |
br i1 %165, label %49, label %.thread.preheader.loopexit | |
.preheader101: ; preds = %49 | |
%.lcssa = phi i32 [ %51, %49 ] | |
%.098108.lcssa = phi i32 [ %.098108, %49 ] | |
%.095109.lcssa = phi float [ %.095109, %49 ] | |
%166 = add nsw i32 %.lcssa, %45 | |
%167 = icmp slt i32 %166, %3 | |
br i1 %167, label %168, label %.thread.preheader | |
; <label>:168: ; preds = %.preheader101 | |
%169 = add nsw i32 %166, %47 | |
%170 = sext i32 %169 to i64 | |
%171 = getelementptr inbounds float, float* %40, i64 %170 | |
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8 | |
%173 = fadd float %.095109.lcssa, %172 | |
%174 = shl i32 %.098108.lcssa, 8 | |
%175 = or i32 %174, 256 | |
%176 = add nsw i32 %175, %45 | |
%177 = icmp slt i32 %176, %3 | |
br i1 %177, label %190, label %.thread.preheader | |
; <label>:178: ; preds = %.thread | |
%.lcssa138 = phi float [ %180, %.thread ] | |
%.not99 = xor i1 %46, true | |
%brmerge = or i1 %.not, %.not99 | |
br i1 %brmerge, label %187, label %183 | |
.thread: ; preds = %.thread.preheader, %.thread | |
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ] | |
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ] | |
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53 | |
%180 = fadd float %.8112, %179 | |
%181 = lshr i32 %.092113, 1 | |
%182 = icmp eq i32 %181, 0 | |
br i1 %182, label %178, label %.thread, !llvm.loop !69 | |
; <label>:183: ; preds = %178 | |
%184 = sext i32 %43 to i64 | |
%185 = getelementptr inbounds float, float* %38, i64 %184 | |
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8 | |
br label %187 | |
; <label>:187: ; preds = %178, %183 | |
%188 = add nuw nsw i32 %.0114, 32 | |
%189 = icmp slt i32 %188, %32 | |
br i1 %189, label %41, label %._crit_edge.loopexit | |
; <label>:190: ; preds = %168 | |
%191 = add nsw i32 %176, %47 | |
%192 = sext i32 %191 to i64 | |
%193 = getelementptr inbounds float, float* %40, i64 %192 | |
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8 | |
%195 = fadd float %173, %194 | |
%196 = shl i32 %.098108.lcssa, 8 | |
%197 = or i32 %196, 512 | |
%198 = add nsw i32 %197, %45 | |
%199 = icmp slt i32 %198, %3 | |
br i1 %199, label %200, label %.thread.preheader | |
; <label>:200: ; preds = %190 | |
%201 = add nsw i32 %198, %47 | |
%202 = sext i32 %201 to i64 | |
%203 = getelementptr inbounds float, float* %40, i64 %202 | |
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8 | |
%205 = fadd float %195, %204 | |
%206 = shl i32 %.098108.lcssa, 8 | |
%207 = or i32 %206, 768 | |
%208 = add nsw i32 %207, %45 | |
%209 = icmp slt i32 %208, %3 | |
br i1 %209, label %210, label %.thread.preheader | |
; <label>:210: ; preds = %200 | |
%211 = add nsw i32 %208, %47 | |
%212 = sext i32 %211 to i64 | |
%213 = getelementptr inbounds float, float* %40, i64 %212 | |
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8 | |
%215 = fadd float %205, %214 | |
%216 = shl i32 %.098108.lcssa, 8 | |
%217 = or i32 %216, 1024 | |
%218 = add nsw i32 %217, %45 | |
%219 = icmp slt i32 %218, %3 | |
br i1 %219, label %220, label %.thread.preheader | |
; <label>:220: ; preds = %210 | |
%221 = add nsw i32 %218, %47 | |
%222 = sext i32 %221 to i64 | |
%223 = getelementptr inbounds float, float* %40, i64 %222 | |
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8 | |
%225 = fadd float %215, %224 | |
%226 = shl i32 %.098108.lcssa, 8 | |
%227 = or i32 %226, 1280 | |
%228 = add nsw i32 %227, %45 | |
%229 = icmp slt i32 %228, %3 | |
br i1 %229, label %230, label %.thread.preheader | |
; <label>:230: ; preds = %220 | |
%231 = add nsw i32 %228, %47 | |
%232 = sext i32 %231 to i64 | |
%233 = getelementptr inbounds float, float* %40, i64 %232 | |
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8 | |
%235 = fadd float %225, %234 | |
%236 = shl i32 %.098108.lcssa, 8 | |
%237 = or i32 %236, 1536 | |
%238 = add nsw i32 %237, %45 | |
%239 = icmp slt i32 %238, %3 | |
br i1 %239, label %240, label %.thread.preheader | |
; <label>:240: ; preds = %230 | |
%241 = add nsw i32 %238, %47 | |
%242 = sext i32 %241 to i64 | |
%243 = getelementptr inbounds float, float* %40, i64 %242 | |
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8 | |
%245 = fadd float %235, %244 | |
%246 = shl i32 %.098108.lcssa, 8 | |
%247 = or i32 %246, 1792 | |
%248 = add nsw i32 %247, %45 | |
%249 = icmp slt i32 %248, %3 | |
br i1 %249, label %250, label %.thread.preheader | |
; <label>:250: ; preds = %240 | |
%251 = add nsw i32 %248, %47 | |
%252 = sext i32 %251 to i64 | |
%253 = getelementptr inbounds float, float* %40, i64 %252 | |
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8 | |
%255 = fadd float %245, %254 | |
%256 = shl i32 %.098108.lcssa, 8 | |
%257 = or i32 %256, 2048 | |
%258 = add nsw i32 %257, %45 | |
%259 = icmp slt i32 %258, %3 | |
br i1 %259, label %260, label %.thread.preheader | |
; <label>:260: ; preds = %250 | |
%261 = add nsw i32 %258, %47 | |
%262 = sext i32 %261 to i64 | |
%263 = getelementptr inbounds float, float* %40, i64 %262 | |
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8 | |
%265 = fadd float %255, %264 | |
%266 = shl i32 %.098108.lcssa, 8 | |
%267 = or i32 %266, 2304 | |
%268 = add nsw i32 %267, %45 | |
%269 = icmp slt i32 %268, %3 | |
br i1 %269, label %270, label %.thread.preheader | |
; <label>:270: ; preds = %260 | |
%271 = add nsw i32 %268, %47 | |
%272 = sext i32 %271 to i64 | |
%273 = getelementptr inbounds float, float* %40, i64 %272 | |
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8 | |
%275 = fadd float %265, %274 | |
%276 = shl i32 %.098108.lcssa, 8 | |
%277 = or i32 %276, 2560 | |
%278 = add nsw i32 %277, %45 | |
%279 = icmp slt i32 %278, %3 | |
br i1 %279, label %280, label %.thread.preheader | |
; <label>:280: ; preds = %270 | |
%281 = add nsw i32 %278, %47 | |
%282 = sext i32 %281 to i64 | |
%283 = getelementptr inbounds float, float* %40, i64 %282 | |
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8 | |
%285 = fadd float %275, %284 | |
%286 = shl i32 %.098108.lcssa, 8 | |
%287 = or i32 %286, 2816 | |
%288 = add nsw i32 %287, %45 | |
%289 = icmp slt i32 %288, %3 | |
br i1 %289, label %290, label %.thread.preheader | |
; <label>:290: ; preds = %280 | |
%291 = add nsw i32 %288, %47 | |
%292 = sext i32 %291 to i64 | |
%293 = getelementptr inbounds float, float* %40, i64 %292 | |
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8 | |
%295 = fadd float %285, %294 | |
%296 = shl i32 %.098108.lcssa, 8 | |
%297 = or i32 %296, 3072 | |
%298 = add nsw i32 %297, %45 | |
%299 = icmp slt i32 %298, %3 | |
br i1 %299, label %300, label %.thread.preheader | |
; <label>:300: ; preds = %290 | |
%301 = add nsw i32 %298, %47 | |
%302 = sext i32 %301 to i64 | |
%303 = getelementptr inbounds float, float* %40, i64 %302 | |
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8 | |
%305 = fadd float %295, %304 | |
%306 = shl i32 %.098108.lcssa, 8 | |
%307 = or i32 %306, 3328 | |
%308 = add nsw i32 %307, %45 | |
%309 = icmp slt i32 %308, %3 | |
br i1 %309, label %310, label %.thread.preheader | |
; <label>:310: ; preds = %300 | |
%311 = add nsw i32 %308, %47 | |
%312 = sext i32 %311 to i64 | |
%313 = getelementptr inbounds float, float* %40, i64 %312 | |
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8 | |
%315 = fadd float %305, %314 | |
%316 = shl i32 %.098108.lcssa, 8 | |
%317 = or i32 %316, 3584 | |
%318 = add nsw i32 %317, %45 | |
%319 = icmp slt i32 %318, %3 | |
br i1 %319, label %320, label %.thread.preheader | |
; <label>:320: ; preds = %310 | |
%321 = add nsw i32 %318, %47 | |
%322 = sext i32 %321 to i64 | |
%323 = getelementptr inbounds float, float* %40, i64 %322 | |
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8 | |
%325 = fadd float %315, %324 | |
br label %.thread.preheader | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 32 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([382 x i8], [382 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %3, 32767 | |
%31 = sdiv i32 %30, 32768 | |
%32 = mul nsw i32 %31, %2 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = icmp slt i32 %33, %32 | |
br i1 %35, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0 | |
%36 = and i32 %34, 31 | |
%.not = icmp ne i32 %36, 0 | |
%37 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0 | |
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0 | |
br label %39 | |
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %29 | |
ret void | |
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ] | |
%40 = srem i32 %.0114, %31 | |
%41 = sdiv i32 %.0114, %31 | |
%42 = shl nsw i32 %40, 15 | |
%43 = or i32 %42, %34 | |
%.idx.val = load float, float* %.idx, align 4 | |
%44 = icmp slt i32 %41, %2 | |
br i1 %44, label %.preheader102, label %.thread.preheader | |
.thread.preheader.loopexit: ; preds = %.preheader.preheader | |
%.lcssa137 = phi float [ %163, %.preheader.preheader ] | |
br label %.thread.preheader | |
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39 | |
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ] | |
br label %.thread | |
.preheader102: ; preds = %39 | |
%45 = mul nsw i32 %41, %3 | |
%46 = add i32 %45, %43 | |
%47 = load float*, float** %38, align 8 | |
br label %48 | |
; <label>:48: ; preds = %.preheader102, %.preheader.preheader | |
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ] | |
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ] | |
%49 = add nuw nsw i32 %.098108, 16 | |
%50 = shl i32 %.098108, 8 | |
%51 = or i32 %50, 3840 | |
%52 = add nsw i32 %51, %43 | |
%53 = icmp slt i32 %52, %3 | |
br i1 %53, label %.preheader.preheader, label %.preheader101 | |
.preheader.preheader: ; preds = %48 | |
%54 = add i32 %46, %50 | |
%55 = sext i32 %54 to i64 | |
%56 = getelementptr inbounds float, float* %47, i64 %55 | |
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8 | |
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8 | |
%59 = shl i32 %.098108, 8 | |
%60 = or i32 %59, 256 | |
%61 = add i32 %46, %60 | |
%62 = sext i32 %61 to i64 | |
%63 = getelementptr inbounds float, float* %47, i64 %62 | |
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8 | |
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8 | |
%66 = shl i32 %.098108, 8 | |
%67 = or i32 %66, 512 | |
%68 = add i32 %46, %67 | |
%69 = sext i32 %68 to i64 | |
%70 = getelementptr inbounds float, float* %47, i64 %69 | |
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8 | |
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8 | |
%73 = shl i32 %.098108, 8 | |
%74 = or i32 %73, 768 | |
%75 = add i32 %46, %74 | |
%76 = sext i32 %75 to i64 | |
%77 = getelementptr inbounds float, float* %47, i64 %76 | |
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8 | |
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8 | |
%80 = shl i32 %.098108, 8 | |
%81 = or i32 %80, 1024 | |
%82 = add i32 %46, %81 | |
%83 = sext i32 %82 to i64 | |
%84 = getelementptr inbounds float, float* %47, i64 %83 | |
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8 | |
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8 | |
%87 = shl i32 %.098108, 8 | |
%88 = or i32 %87, 1280 | |
%89 = add i32 %46, %88 | |
%90 = sext i32 %89 to i64 | |
%91 = getelementptr inbounds float, float* %47, i64 %90 | |
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8 | |
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8 | |
%94 = shl i32 %.098108, 8 | |
%95 = or i32 %94, 1536 | |
%96 = add i32 %46, %95 | |
%97 = sext i32 %96 to i64 | |
%98 = getelementptr inbounds float, float* %47, i64 %97 | |
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8 | |
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8 | |
%101 = shl i32 %.098108, 8 | |
%102 = or i32 %101, 1792 | |
%103 = add i32 %46, %102 | |
%104 = sext i32 %103 to i64 | |
%105 = getelementptr inbounds float, float* %47, i64 %104 | |
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8 | |
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8 | |
%108 = shl i32 %.098108, 8 | |
%109 = or i32 %108, 2048 | |
%110 = add i32 %46, %109 | |
%111 = sext i32 %110 to i64 | |
%112 = getelementptr inbounds float, float* %47, i64 %111 | |
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8 | |
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8 | |
%115 = shl i32 %.098108, 8 | |
%116 = or i32 %115, 2304 | |
%117 = add i32 %46, %116 | |
%118 = sext i32 %117 to i64 | |
%119 = getelementptr inbounds float, float* %47, i64 %118 | |
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8 | |
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8 | |
%122 = shl i32 %.098108, 8 | |
%123 = or i32 %122, 2560 | |
%124 = add i32 %46, %123 | |
%125 = sext i32 %124 to i64 | |
%126 = getelementptr inbounds float, float* %47, i64 %125 | |
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8 | |
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8 | |
%129 = shl i32 %.098108, 8 | |
%130 = or i32 %129, 2816 | |
%131 = add i32 %46, %130 | |
%132 = sext i32 %131 to i64 | |
%133 = getelementptr inbounds float, float* %47, i64 %132 | |
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8 | |
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8 | |
%136 = shl i32 %.098108, 8 | |
%137 = or i32 %136, 3072 | |
%138 = add i32 %46, %137 | |
%139 = sext i32 %138 to i64 | |
%140 = getelementptr inbounds float, float* %47, i64 %139 | |
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8 | |
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8 | |
%143 = shl i32 %.098108, 8 | |
%144 = or i32 %143, 3328 | |
%145 = add i32 %46, %144 | |
%146 = sext i32 %145 to i64 | |
%147 = getelementptr inbounds float, float* %47, i64 %146 | |
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8 | |
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8 | |
%150 = shl i32 %.098108, 8 | |
%151 = or i32 %150, 3584 | |
%152 = add i32 %46, %151 | |
%153 = sext i32 %152 to i64 | |
%154 = getelementptr inbounds float, float* %47, i64 %153 | |
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8 | |
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8 | |
%157 = shl i32 %.098108, 8 | |
%158 = or i32 %157, 3840 | |
%159 = add i32 %46, %158 | |
%160 = sext i32 %159 to i64 | |
%161 = getelementptr inbounds float, float* %47, i64 %160 | |
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8 | |
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8 | |
%164 = icmp slt i32 %49, 128 | |
br i1 %164, label %48, label %.thread.preheader.loopexit | |
.preheader101: ; preds = %48 | |
%.lcssa = phi i32 [ %50, %48 ] | |
%.098108.lcssa = phi i32 [ %.098108, %48 ] | |
%.095109.lcssa = phi float [ %.095109, %48 ] | |
%165 = load float*, float** %38, align 8 | |
%166 = add nsw i32 %.lcssa, %43 | |
%167 = icmp slt i32 %166, %3 | |
br i1 %167, label %168, label %.thread.preheader | |
; <label>:168: ; preds = %.preheader101 | |
%169 = add nsw i32 %166, %45 | |
%170 = sext i32 %169 to i64 | |
%171 = getelementptr inbounds float, float* %165, i64 %170 | |
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8 | |
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8 | |
%174 = shl i32 %.098108.lcssa, 8 | |
%175 = or i32 %174, 256 | |
%176 = add nsw i32 %175, %43 | |
%177 = icmp slt i32 %176, %3 | |
br i1 %177, label %198, label %.thread.preheader | |
; <label>:178: ; preds = %.thread | |
%.lcssa138 = phi float [ %180, %.thread ] | |
%.not99 = xor i1 %44, true | |
%brmerge = or i1 %.not, %.not99 | |
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183 | |
.thread: ; preds = %.thread.preheader, %.thread | |
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ] | |
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ] | |
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53 | |
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8 | |
%181 = lshr i32 %.092113, 1 | |
%182 = icmp eq i32 %181, 0 | |
br i1 %182, label %178, label %.thread, !llvm.loop !70 | |
; <label>:183: ; preds = %178 | |
%184 = load float*, float** %37, align 8 | |
%185 = sext i32 %41 to i64 | |
%186 = getelementptr inbounds float, float* %184, i64 %185 | |
%187 = bitcast float %.lcssa138 to i32 | |
%188 = bitcast float* %186 to i32* | |
%189 = load i32, i32* %188, align 4 | |
br label %190 | |
; <label>:190: ; preds = %193, %183 | |
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ] | |
%191 = bitcast i32 %.011.i to float | |
%192 = fcmp olt float %191, %.lcssa138 | |
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit | |
; <label>:193: ; preds = %190 | |
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst | |
%195 = extractvalue { i32, i1 } %194, 0 | |
%not..i = icmp eq i32 %.011.i, %195 | |
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193 | |
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178 | |
%196 = add nuw nsw i32 %.0114, 32 | |
%197 = icmp slt i32 %196, %32 | |
br i1 %197, label %39, label %._crit_edge.loopexit | |
; <label>:198: ; preds = %168 | |
%199 = add nsw i32 %176, %45 | |
%200 = sext i32 %199 to i64 | |
%201 = getelementptr inbounds float, float* %165, i64 %200 | |
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8 | |
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8 | |
%204 = shl i32 %.098108.lcssa, 8 | |
%205 = or i32 %204, 512 | |
%206 = add nsw i32 %205, %43 | |
%207 = icmp slt i32 %206, %3 | |
br i1 %207, label %208, label %.thread.preheader | |
; <label>:208: ; preds = %198 | |
%209 = add nsw i32 %206, %45 | |
%210 = sext i32 %209 to i64 | |
%211 = getelementptr inbounds float, float* %165, i64 %210 | |
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8 | |
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8 | |
%214 = shl i32 %.098108.lcssa, 8 | |
%215 = or i32 %214, 768 | |
%216 = add nsw i32 %215, %43 | |
%217 = icmp slt i32 %216, %3 | |
br i1 %217, label %218, label %.thread.preheader | |
; <label>:218: ; preds = %208 | |
%219 = add nsw i32 %216, %45 | |
%220 = sext i32 %219 to i64 | |
%221 = getelementptr inbounds float, float* %165, i64 %220 | |
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8 | |
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8 | |
%224 = shl i32 %.098108.lcssa, 8 | |
%225 = or i32 %224, 1024 | |
%226 = add nsw i32 %225, %43 | |
%227 = icmp slt i32 %226, %3 | |
br i1 %227, label %228, label %.thread.preheader | |
; <label>:228: ; preds = %218 | |
%229 = add nsw i32 %226, %45 | |
%230 = sext i32 %229 to i64 | |
%231 = getelementptr inbounds float, float* %165, i64 %230 | |
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8 | |
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8 | |
%234 = shl i32 %.098108.lcssa, 8 | |
%235 = or i32 %234, 1280 | |
%236 = add nsw i32 %235, %43 | |
%237 = icmp slt i32 %236, %3 | |
br i1 %237, label %238, label %.thread.preheader | |
; <label>:238: ; preds = %228 | |
%239 = add nsw i32 %236, %45 | |
%240 = sext i32 %239 to i64 | |
%241 = getelementptr inbounds float, float* %165, i64 %240 | |
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8 | |
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8 | |
%244 = shl i32 %.098108.lcssa, 8 | |
%245 = or i32 %244, 1536 | |
%246 = add nsw i32 %245, %43 | |
%247 = icmp slt i32 %246, %3 | |
br i1 %247, label %248, label %.thread.preheader | |
; <label>:248: ; preds = %238 | |
%249 = add nsw i32 %246, %45 | |
%250 = sext i32 %249 to i64 | |
%251 = getelementptr inbounds float, float* %165, i64 %250 | |
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8 | |
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8 | |
%254 = shl i32 %.098108.lcssa, 8 | |
%255 = or i32 %254, 1792 | |
%256 = add nsw i32 %255, %43 | |
%257 = icmp slt i32 %256, %3 | |
br i1 %257, label %258, label %.thread.preheader | |
; <label>:258: ; preds = %248 | |
%259 = add nsw i32 %256, %45 | |
%260 = sext i32 %259 to i64 | |
%261 = getelementptr inbounds float, float* %165, i64 %260 | |
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8 | |
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8 | |
%264 = shl i32 %.098108.lcssa, 8 | |
%265 = or i32 %264, 2048 | |
%266 = add nsw i32 %265, %43 | |
%267 = icmp slt i32 %266, %3 | |
br i1 %267, label %268, label %.thread.preheader | |
; <label>:268: ; preds = %258 | |
%269 = add nsw i32 %266, %45 | |
%270 = sext i32 %269 to i64 | |
%271 = getelementptr inbounds float, float* %165, i64 %270 | |
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8 | |
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8 | |
%274 = shl i32 %.098108.lcssa, 8 | |
%275 = or i32 %274, 2304 | |
%276 = add nsw i32 %275, %43 | |
%277 = icmp slt i32 %276, %3 | |
br i1 %277, label %278, label %.thread.preheader | |
; <label>:278: ; preds = %268 | |
%279 = add nsw i32 %276, %45 | |
%280 = sext i32 %279 to i64 | |
%281 = getelementptr inbounds float, float* %165, i64 %280 | |
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8 | |
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8 | |
%284 = shl i32 %.098108.lcssa, 8 | |
%285 = or i32 %284, 2560 | |
%286 = add nsw i32 %285, %43 | |
%287 = icmp slt i32 %286, %3 | |
br i1 %287, label %288, label %.thread.preheader | |
; <label>:288: ; preds = %278 | |
%289 = add nsw i32 %286, %45 | |
%290 = sext i32 %289 to i64 | |
%291 = getelementptr inbounds float, float* %165, i64 %290 | |
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8 | |
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8 | |
%294 = shl i32 %.098108.lcssa, 8 | |
%295 = or i32 %294, 2816 | |
%296 = add nsw i32 %295, %43 | |
%297 = icmp slt i32 %296, %3 | |
br i1 %297, label %298, label %.thread.preheader | |
; <label>:298: ; preds = %288 | |
%299 = add nsw i32 %296, %45 | |
%300 = sext i32 %299 to i64 | |
%301 = getelementptr inbounds float, float* %165, i64 %300 | |
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8 | |
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8 | |
%304 = shl i32 %.098108.lcssa, 8 | |
%305 = or i32 %304, 3072 | |
%306 = add nsw i32 %305, %43 | |
%307 = icmp slt i32 %306, %3 | |
br i1 %307, label %308, label %.thread.preheader | |
; <label>:308: ; preds = %298 | |
%309 = add nsw i32 %306, %45 | |
%310 = sext i32 %309 to i64 | |
%311 = getelementptr inbounds float, float* %165, i64 %310 | |
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8 | |
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8 | |
%314 = shl i32 %.098108.lcssa, 8 | |
%315 = or i32 %314, 3328 | |
%316 = add nsw i32 %315, %43 | |
%317 = icmp slt i32 %316, %3 | |
br i1 %317, label %318, label %.thread.preheader | |
; <label>:318: ; preds = %308 | |
%319 = add nsw i32 %316, %45 | |
%320 = sext i32 %319 to i64 | |
%321 = getelementptr inbounds float, float* %165, i64 %320 | |
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8 | |
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8 | |
%324 = shl i32 %.098108.lcssa, 8 | |
%325 = or i32 %324, 3584 | |
%326 = add nsw i32 %325, %43 | |
%327 = icmp slt i32 %326, %3 | |
br i1 %327, label %328, label %.thread.preheader | |
; <label>:328: ; preds = %318 | |
%329 = add nsw i32 %326, %45 | |
%330 = sext i32 %329 to i64 | |
%331 = getelementptr inbounds float, float* %165, i64 %330 | |
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8 | |
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8 | |
br label %.thread.preheader | |
} | |
; Function Attrs: convergent nounwind | |
;
; ColumnReduceKernel<GridSize=128, BlockSize=256, NumPerThread=16> instantiated for a
; float SUM reduction (CudaSumReducer) over a 2-D tensor evaluator.
; Per the mangled signature (...vT4_T2_iiT3_) the arguments are:
;   %0 = reducer object (byval, empty struct)
;   %1 = input TensorEvaluator (byval)
;   %2, %3 = two i32 extents -- presumably num_rows / num_cols of the 2-D input
;            (NOTE(review): names inferred from Eigen's ColumnReduceKernel; confirm)
;   %4 = output PtrWrapper (byval, holds the result float*)
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat { | |
; Launch-configuration asserts (source lines 93-99): the kernel aborts via
; __assert_fail unless blockDim == (256,1,1) and gridDim == (128,1,1), matching
; the BlockSize/GridSize template parameters.
  %6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
  %7 = icmp eq i32 %6, 256 | |
  br i1 %7, label %9, label %8 | |
; <label>:8:                                      ; preds = %5 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:9:                                      ; preds = %5 | |
  %10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
  %11 = icmp eq i32 %10, 1 | |
  br i1 %11, label %13, label %12 | |
; <label>:12:                                     ; preds = %9 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:13:                                     ; preds = %9 | |
  %14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
  %15 = icmp eq i32 %14, 1 | |
  br i1 %15, label %17, label %16 | |
; <label>:16:                                     ; preds = %13 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:17:                                     ; preds = %13 | |
  %18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
  %19 = icmp eq i32 %18, 128 | |
  br i1 %19, label %21, label %20 | |
; <label>:20:                                     ; preds = %17 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:21:                                     ; preds = %17 | |
  %22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
  %23 = icmp eq i32 %22, 1 | |
  br i1 %23, label %25, label %24 | |
; <label>:24:                                     ; preds = %21 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:25:                                     ; preds = %21 | |
  %26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
  %27 = icmp eq i32 %26, 1 | |
  br i1 %27, label %29, label %28 | |
; <label>:28:                                     ; preds = %25 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; Work-item geometry: %31 = ceil(%2 / 16) row-blocks of 16 rows each,
; %32 = %31 * %3 total (row-block, column) work items.
; %36 = global thread id = ctaid.x * 256 + tid.x.
; <label>:29:                                     ; preds = %25 | |
  %30 = add nsw i32 %2, 15 | |
  %31 = sdiv i32 %30, 16 | |
  %32 = mul nsw i32 %31, %3 | |
  %33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
  %34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
  %35 = shl nuw nsw i32 %33, 8 | |
  %36 = add nuw nsw i32 %35, %34 | |
  %37 = icmp slt i32 %36, %32 | |
  br i1 %37, label %.lr.ph, label %._crit_edge | |
; Loop setup: %40 = input data pointer (first field of the evaluator),
; %42 = output pointer (from PtrWrapper). %38 splits on the sign of %3:
; the ".us" path (%3 >= 0) does the real reduction; the other path is the
; degenerate %3 < 0 specialization where the inner loop body folded away.
.lr.ph:                                           ; preds = %29 | |
  %38 = icmp sgt i32 %3, -1 | |
  %39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0 | |
  %40 = load float*, float** %39, align 8 | |
  %41 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0 | |
  %42 = load float*, float** %41, align 8 | |
  br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader | |
; Degenerate-path preheader: trip count of the outer loop (stride 32768 =
; 128 blocks * 256 threads) is precomputed so the compiler's 4x unroll
; (%unroll_iter main part + %xtraiter epilogue) can be driven by a counter.
.lr.ph.split.preheader:                           ; preds = %.lr.ph | |
  %43 = add i32 %32, -1 | |
  %44 = sub i32 %43, %34 | |
  %45 = sub i32 %44, %35 | |
  %46 = lshr i32 %45, 15 | |
  %47 = add nuw nsw i32 %46, 1 | |
  %xtraiter = and i32 %47, 3 | |
  %48 = icmp ult i32 %45, 98304 | |
  br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new | |
.lr.ph.split.preheader.new:                       ; preds = %.lr.ph.split.preheader | |
  %unroll_iter = sub nsw i32 %47, %xtraiter | |
  br label %.lr.ph.split | |
.lr.ph.split.us.preheader:                        ; preds = %.lr.ph | |
  br label %.lr.ph.split.us | |
; Outer loop (main path): for work item %.047.us,
;   %49 = column index (item mod %3), %51 = row-block index, %52 = first row (*16).
.lr.ph.split.us:                                  ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us | |
  %.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ] | |
  %49 = srem i32 %.047.us, %3 | |
  %50 = sdiv i32 %.047.us, %3 | |
  %51 = srem i32 %50, %31 | |
  %52 = shl nsw i32 %51, 4 | |
  br label %53 | |
; Inner loop, unrolled by 2 (16 iterations total -> NumPerThread=16 rows):
; accumulates input[(row)*%3 + col] into %.04445.us.us, substituting the sum
; identity 0.0 for rows past the %2 bound.
; <label>:53:                                     ; preds = %104, %.lr.ph.split.us | |
  %.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ] | |
  %.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ] | |
  %54 = add nuw nsw i32 %.04346.us.us, %52 | |
  %55 = icmp slt i32 %54, %2 | |
  br i1 %55, label %56, label %62 | |
; <label>:56:                                     ; preds = %53 | |
  %57 = mul nsw i32 %54, %3 | |
  %58 = add nsw i32 %57, %49 | |
  %59 = sext i32 %58 to i64 | |
  %60 = getelementptr inbounds float, float* %40, i64 %59 | |
  %61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8 | |
  br label %62 | |
; <label>:62:                                     ; preds = %56, %53 | |
  %63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ] | |
  %64 = fadd float %.04445.us.us, %63 | |
  %65 = or i32 %.04346.us.us, 1 | |
  %66 = add nuw nsw i32 %65, %52 | |
  %67 = icmp slt i32 %66, %2 | |
  br i1 %67, label %98, label %104 | |
; After the 16-row partial sum: publish it with a float atomic add into
; output[col], then advance the outer loop by 32768 work items.
.us-lcssa.us.us:                                  ; preds = %104 | |
  %.lcssa = phi float [ %106, %104 ] | |
  %68 = sext i32 %49 to i64 | |
  %69 = getelementptr inbounds float, float* %42, i64 %68 | |
  %70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8 | |
  %71 = add nuw nsw i32 %.047.us, 32768 | |
  %72 = icmp slt i32 %71, %32 | |
  br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit | |
._crit_edge.loopexit:                             ; preds = %.us-lcssa.us.us | |
  br label %._crit_edge | |
._crit_edge.loopexit59.unr-lcssa.loopexit:        ; preds = %.lr.ph.split | |
  %.lcssa60 = phi i32 [ %97, %.lr.ph.split ] | |
  br label %._crit_edge.loopexit59.unr-lcssa | |
._crit_edge.loopexit59.unr-lcssa:                 ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader | |
  %.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ] | |
  %lcmp.mod = icmp eq i32 %xtraiter, 0 | |
  br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader | |
.lr.ph.split.epil.preheader:                      ; preds = %._crit_edge.loopexit59.unr-lcssa | |
  br label %.lr.ph.split.epil | |
; Degenerate-path epilogue: same body as the unrolled loop below -- the
; partial sum is the constant identity 0.0, so each step atomically adds 0.0.
.lr.ph.split.epil:                                ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader | |
  %.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ] | |
  %epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ] | |
  %73 = srem i32 %.047.epil, %3 | |
  %74 = sext i32 %73 to i64 | |
  %75 = getelementptr inbounds float, float* %42, i64 %74 | |
  %76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8 | |
  %77 = add nuw nsw i32 %.047.epil, 32768 | |
  %epil.iter.sub = add i32 %epil.iter, -1 | |
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 | |
  br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !71 | |
._crit_edge.loopexit59.epilog-lcssa:              ; preds = %.lr.ph.split.epil | |
  br label %._crit_edge.loopexit59 | |
._crit_edge.loopexit59:                           ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa | |
  br label %._crit_edge | |
._crit_edge:                                      ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29 | |
  ret void | |
; Degenerate-path main loop, unrolled 4x by the compiler: four atomic adds of
; the identity 0.0 at strides of 32768 work items per unrolled iteration.
.lr.ph.split:                                     ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new | |
  %.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ] | |
  %niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ] | |
  %78 = srem i32 %.047, %3 | |
  %79 = sext i32 %78 to i64 | |
  %80 = getelementptr inbounds float, float* %42, i64 %79 | |
  %81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8 | |
  %82 = add nuw nsw i32 %.047, 32768 | |
  %83 = srem i32 %82, %3 | |
  %84 = sext i32 %83 to i64 | |
  %85 = getelementptr inbounds float, float* %42, i64 %84 | |
  %86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8 | |
  %87 = add nsw i32 %.047, 65536 | |
  %88 = srem i32 %87, %3 | |
  %89 = sext i32 %88 to i64 | |
  %90 = getelementptr inbounds float, float* %42, i64 %89 | |
  %91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8 | |
  %92 = add nsw i32 %.047, 98304 | |
  %93 = srem i32 %92, %3 | |
  %94 = sext i32 %93 to i64 | |
  %95 = getelementptr inbounds float, float* %42, i64 %94 | |
  %96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8 | |
  %97 = add nsw i32 %.047, 131072 | |
  %niter.nsub.3 = add i32 %niter, -4 | |
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 | |
  br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split | |
; Second half of the 2x-unrolled inner loop (odd row %66 = (idx|1) + base).
; <label>:98:                                     ; preds = %62 | |
  %99 = mul nsw i32 %66, %3 | |
  %100 = add nsw i32 %99, %49 | |
  %101 = sext i32 %100 to i64 | |
  %102 = getelementptr inbounds float, float* %40, i64 %101 | |
  %103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8 | |
  br label %104 | |
; <label>:104:                                    ; preds = %98, %62 | |
  %105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ] | |
  %106 = fadd float %64, %105 | |
  %107 = add nsw i32 %.04346.us.us, 2 | |
  %exitcond.1 = icmp eq i32 %107, 16 | |
  br i1 %exitcond.1, label %.us-lcssa.us.us, label %53 | |
} | |
; Function Attrs: convergent nounwind | |
;
; ColumnReduceKernel<GridSize=128, BlockSize=256, NumPerThread=16> instantiated
; for a float MAX reduction (CudaMaxReducer). Structure mirrors the sum variant
; above, with two differences:
;   * the running value is seeded from a float stored in the reducer object
;     (%.idx45) rather than the constant 0.0;
;   * the final merge into output[col] is a compare-and-swap loop (cmpxchg on
;     the float's i32 bits) rather than a single atomic-add intrinsic.
; Arguments: %0 = reducer (byval, holds the seed float), %1 = input evaluator,
; %2, %3 = i32 extents (presumably num_rows / num_cols -- TODO confirm),
; %4 = output PtrWrapper.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::internal::PtrWrapper"* byval align 8) #2 comdat { | |
; Launch-configuration asserts (source lines 93-99): blockDim must be
; (256,1,1) and gridDim (128,1,1), else __assert_fail.
  %6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
  %7 = icmp eq i32 %6, 256 | |
  br i1 %7, label %9, label %8 | |
; <label>:8:                                      ; preds = %5 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:9:                                      ; preds = %5 | |
  %10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
  %11 = icmp eq i32 %10, 1 | |
  br i1 %11, label %13, label %12 | |
; <label>:12:                                     ; preds = %9 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:13:                                     ; preds = %9 | |
  %14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
  %15 = icmp eq i32 %14, 1 | |
  br i1 %15, label %17, label %16 | |
; <label>:16:                                     ; preds = %13 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:17:                                     ; preds = %13 | |
  %18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
  %19 = icmp eq i32 %18, 128 | |
  br i1 %19, label %21, label %20 | |
; <label>:20:                                     ; preds = %17 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:21:                                     ; preds = %17 | |
  %22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
  %23 = icmp eq i32 %22, 1 | |
  br i1 %23, label %25, label %24 | |
; <label>:24:                                     ; preds = %21 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; <label>:25:                                     ; preds = %21 | |
  %26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
  %27 = icmp eq i32 %26, 1 | |
  br i1 %27, label %29, label %28 | |
; <label>:28:                                     ; preds = %25 | |
  tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([385 x i8], [385 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
  unreachable | |
; Geometry: %31 = ceil(%2 / 16) row-blocks, %32 = total work items,
; %36 = global thread id = ctaid.x * 256 + tid.x.
; <label>:29:                                     ; preds = %25 | |
  %30 = add nsw i32 %2, 15 | |
  %31 = sdiv i32 %30, 16 | |
  %32 = mul nsw i32 %31, %3 | |
  %33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
  %34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
  %35 = shl nuw nsw i32 %33, 8 | |
  %36 = add nuw nsw i32 %35, %34 | |
  %37 = icmp slt i32 %36, %32 | |
  br i1 %37, label %.lr.ph, label %._crit_edge | |
; %.idx45 points at the reducer's stored float (the reduction's seed value);
; %39/%40 locate the input data pointer and the output pointer slot.
.lr.ph:                                           ; preds = %29 | |
  %.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0 | |
  %38 = icmp sgt i32 %3, -1 | |
  %39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0 | |
  %40 = getelementptr inbounds %"struct.Eigen::internal::PtrWrapper", %"struct.Eigen::internal::PtrWrapper"* %4, i64 0, i32 0 | |
  br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader | |
.lr.ph.split.preheader:                           ; preds = %.lr.ph | |
  br label %.lr.ph.split | |
.lr.ph.split.us.preheader:                        ; preds = %.lr.ph | |
  br label %.lr.ph.split.us | |
; Outer loop (main path, %3 >= 0): %41 = column, %43 = row-block, %44 = first
; row; the seed %.idx45.val.us and input pointer %45 are re-read per item.
.lr.ph.split.us:                                  ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
  %.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ] | |
  %41 = srem i32 %.048.us, %3 | |
  %42 = sdiv i32 %.048.us, %3 | |
  %43 = srem i32 %42, %31 | |
  %44 = shl nsw i32 %43, 4 | |
  %.idx45.val.us = load float, float* %.idx45, align 4 | |
  %45 = load float*, float** %39, align 8 | |
  br label %54 | |
; Inlined CudaMaxReducer::atomic_reduce CAS loop: while the value observed at
; output[col] (reinterpreted as float) is still less than our partial max
; %.lcssa, attempt to install our bits (%72) via cmpxchg; retry with the value
; the cmpxchg actually observed.
; <label>:46:                                     ; preds = %49, %.us-lcssa.us.us | |
  %.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ] | |
  %47 = bitcast i32 %.011.i.us to float | |
  %48 = fcmp olt float %47, %.lcssa | |
  br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
; <label>:49:                                     ; preds = %46 | |
  %50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst | |
  %51 = extractvalue { i32, i1 } %50, 0 | |
  %not..i.us = icmp eq i32 %.011.i.us, %51 | |
  br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46 | |
  %52 = add nuw nsw i32 %.048.us, 32768 | |
  %53 = icmp slt i32 %52, %32 | |
  br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit | |
; Inner loop, unrolled by 2 (16 rows per work item): folds input values into
; the running max with @llvm.nvvm.fmax.f, substituting the seed value for rows
; past the %2 bound.
; <label>:54:                                     ; preds = %112, %.lr.ph.split.us | |
  %.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ] | |
  %.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ] | |
  %55 = add nuw nsw i32 %.04347.us.us, %44 | |
  %56 = icmp slt i32 %55, %2 | |
  br i1 %56, label %57, label %63 | |
; <label>:57:                                     ; preds = %54 | |
  %58 = mul nsw i32 %55, %3 | |
  %59 = add nsw i32 %58, %41 | |
  %60 = sext i32 %59 to i64 | |
  %61 = getelementptr inbounds float, float* %45, i64 %60 | |
  %62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8 | |
  br label %63 | |
; <label>:63:                                     ; preds = %54, %57 | |
  %64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ] | |
  %65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8 | |
  %66 = or i32 %.04347.us.us, 1 | |
  %67 = add nuw nsw i32 %66, %44 | |
  %68 = icmp slt i32 %67, %2 | |
  br i1 %68, label %106, label %112 | |
; Prepare the CAS: %72 = bit pattern of our partial max, %73 = output slot as
; i32*, %74 = current value at the slot; then enter the retry loop at %46.
.us-lcssa.us.us:                                  ; preds = %112 | |
  %.lcssa = phi float [ %114, %112 ] | |
  %69 = load float*, float** %40, align 8 | |
  %70 = sext i32 %41 to i64 | |
  %71 = getelementptr inbounds float, float* %69, i64 %70 | |
  %72 = bitcast float %.lcssa to i32 | |
  %73 = bitcast float* %71 to i32* | |
  %74 = load i32, i32* %73, align 4 | |
  br label %46 | |
._crit_edge.loopexit:                             ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
  br label %._crit_edge | |
._crit_edge.loopexit60:                           ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
  br label %._crit_edge | |
._crit_edge:                                      ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29 | |
  ret void | |
; Degenerate path (%3 < 0): the 16-iteration inner loop fully unrolled into 16
; fmax calls that all fold the seed value into itself, followed by the same CAS
; publish sequence on output[col].
.lr.ph.split:                                     ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
  %.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ] | |
  %.idx45.val = load float, float* %.idx45, align 4 | |
  %75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8 | |
  %76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8 | |
  %77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8 | |
  %78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8 | |
  %79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8 | |
  %80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8 | |
  %81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8 | |
  %82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8 | |
  %83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8 | |
  %84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8 | |
  %85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8 | |
  %86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8 | |
  %87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8 | |
  %88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8 | |
  %89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8 | |
  %90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8 | |
  %91 = srem i32 %.048, %3 | |
  %92 = load float*, float** %40, align 8 | |
  %93 = sext i32 %91 to i64 | |
  %94 = getelementptr inbounds float, float* %92, i64 %93 | |
  %95 = bitcast float %90 to i32 | |
  %96 = bitcast float* %94 to i32* | |
  %97 = load i32, i32* %96, align 4 | |
  br label %98 | |
; CAS retry loop for the degenerate path (same shape as %46/%49 above).
; <label>:98:                                     ; preds = %101, %.lr.ph.split | |
  %.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ] | |
  %99 = bitcast i32 %.011.i to float | |
  %100 = fcmp olt float %99, %90 | |
  br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
; <label>:101:                                    ; preds = %98 | |
  %102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst | |
  %103 = extractvalue { i32, i1 } %102, 0 | |
  %not..i = icmp eq i32 %.011.i, %103 | |
  br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101 | |
  %104 = add nuw nsw i32 %.048, 32768 | |
  %105 = icmp slt i32 %104, %32 | |
  br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60 | |
; Second half of the 2x-unrolled inner loop (odd row %67 = (idx|1) + base).
; <label>:106:                                    ; preds = %63 | |
  %107 = mul nsw i32 %67, %3 | |
  %108 = add nsw i32 %107, %41 | |
  %109 = sext i32 %108 to i64 | |
  %110 = getelementptr inbounds float, float* %45, i64 %109 | |
  %111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8 | |
  br label %112 | |
; <label>:112:                                    ; preds = %106, %63 | |
  %113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ] | |
  %114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8 | |
  %115 = add nsw i32 %.04347.us.us, 2 | |
  %exitcond.1 = icmp eq i32 %115, 16 | |
  br i1 %exitcond.1, label %.us-lcssa.us.us, label %54 | |
} | |
; Function Attrs: nounwind | |
;
; EigenMetaKernel for evaluating a TensorAssignOp: the RHS (a forced-eval'd sum
; reduction) has already been materialized into a buffer, so this kernel is a
; plain element copy of %1 floats from the evaluator's source buffer
; (%.sroa.5.0.copyload) into its destination buffer (%.sroa.021.0.copyload).
; Each 32-bit element is moved as an i32 load/store (a bit-exact float copy).
; Arguments: %0 = evaluator (byval), %1 = i32 element count.
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.15"* byval align 8, i32) #0 comdat { | |
; %7 = first index for this thread = blockDim.x*blockIdx.x + threadIdx.x;
; %9 = per-iteration stride = gridDim.x * blockDim.x.
  %3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
  %4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
  %5 = mul nuw nsw i32 %4, %3 | |
  %6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
  %7 = add nuw nsw i32 %5, %6 | |
  %8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
  %9 = mul nuw nsw i32 %8, %4 | |
  %.sroa.021.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %0, i64 0, i32 0, i32 0 | |
  %.sroa.021.0.copyload = load float*, float** %.sroa.021.0..sroa_idx, align 8 | |
  %.sroa.5.0..sroa_idx25 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.15", %"struct.Eigen::TensorEvaluator.15"* %0, i64 0, i32 1, i32 3 | |
  %.sroa.5.0.copyload = load float*, float** %.sroa.5.0..sroa_idx25, align 8 | |
  %10 = icmp slt i32 %7, %1 | |
  br i1 %10, label %.lr.ph.i.preheader, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit | |
.lr.ph.i.preheader:                               ; preds = %2 | |
  br label %.lr.ph.i | |
; Copy loop: dst[i] = src[i] for i = %7, %7+%9, ... while i < %1.
.lr.ph.i:                                         ; preds = %.lr.ph.i.preheader, %.lr.ph.i | |
  %.07.i = phi i32 [ %17, %.lr.ph.i ], [ %7, %.lr.ph.i.preheader ] | |
  %11 = sext i32 %.07.i to i64 | |
  %12 = getelementptr inbounds float, float* %.sroa.5.0.copyload, i64 %11 | |
  %13 = bitcast float* %12 to i32* | |
  %14 = load i32, i32* %13, align 4 | |
  %15 = getelementptr inbounds float, float* %.sroa.021.0.copyload, i64 %11 | |
  %16 = bitcast float* %15 to i32* | |
  store i32 %14, i32* %16, align 4 | |
  %17 = add nsw i32 %.07.i, %9 | |
  %18 = icmp slt i32 %17, %1 | |
  br i1 %18, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit | |
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit: ; preds = %.lr.ph.i | |
  br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit | |
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpINS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiLb0EE3runESP_iii.exit.loopexit, %2 | |
  ret void | |
} | |
; Function Attrs: nounwind | |
define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator.24"* byval align 8, i32) #0 comdat { | |
%3 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%4 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%5 = mul nuw nsw i32 %4, %3 | |
%6 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%7 = add nuw nsw i32 %5, %6 | |
%8 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%9 = mul nuw nsw i32 %8, %4 | |
%.sroa.041.0..sroa_idx = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 0, i32 0 | |
%.sroa.041.0.copyload = load float*, float** %.sroa.041.0..sroa_idx, align 8 | |
%.sroa.545.0..sroa_idx46 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 7 | |
%.sroa.545.0..sroa_cast = bitcast i32* %.sroa.545.0..sroa_idx46 to i64* | |
%.sroa.545.0.copyload = load i64, i64* %.sroa.545.0..sroa_cast, align 8 | |
%.sroa.648.0..sroa_idx49 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 9, i32 0, i64 0 | |
%.sroa.648.0.copyload = load i32, i32* %.sroa.648.0..sroa_idx49, align 8 | |
%.sroa.8.0..sroa_idx53 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.24", %"struct.Eigen::TensorEvaluator.24"* %0, i64 0, i32 1, i32 10, i32 0 | |
%.sroa.8.0.copyload = load float*, float** %.sroa.8.0..sroa_idx53, align 8 | |
%10 = icmp slt i32 %7, %1 | |
br i1 %10, label %.lr.ph.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit | |
.lr.ph.i: ; preds = %2 | |
%11 = trunc i64 %.sroa.545.0.copyload to i32 | |
%12 = icmp sgt i32 %.sroa.648.0.copyload, 0 | |
%13 = lshr i64 %.sroa.545.0.copyload, 32 | |
%14 = trunc i64 %13 to i32 | |
br i1 %12, label %.lr.ph.split.us.i.preheader, label %.lr.ph.split.i.preheader | |
.lr.ph.split.i.preheader: ; preds = %.lr.ph.i | |
br label %.lr.ph.split.i | |
.lr.ph.split.us.i.preheader: ; preds = %.lr.ph.i | |
%15 = add i32 %.sroa.648.0.copyload, -1 | |
%xtraiter = and i32 %.sroa.648.0.copyload, 3 | |
%16 = icmp ult i32 %15, 3 | |
%lcmp.mod = icmp eq i32 %xtraiter, 0 | |
%unroll_iter = sub i32 %.sroa.648.0.copyload, %xtraiter | |
br label %.lr.ph.split.us.i | |
.lr.ph.split.us.i: ; preds = %.lr.ph.split.us.i.preheader, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i | |
%.07.us.i = phi i32 [ %59, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i ], [ %7, %.lr.ph.split.us.i.preheader ] | |
%17 = mul nsw i32 %.07.us.i, %11 | |
br i1 %16, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, label %.lr.ph.split.us.i.new | |
.lr.ph.split.us.i.new: ; preds = %.lr.ph.split.us.i | |
br label %18 | |
; <label>:18: ; preds = %18, %.lr.ph.split.us.i.new | |
%19 = phi float [ 0.000000e+00, %.lr.ph.split.us.i.new ], [ %46, %18 ] | |
%.012.i.i.i.us.i = phi i32 [ 0, %.lr.ph.split.us.i.new ], [ %47, %18 ] | |
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.us.i.new ], [ %niter.nsub.3, %18 ] | |
%20 = mul nsw i32 %.012.i.i.i.us.i, %14 | |
%21 = add nsw i32 %20, %17 | |
%22 = sext i32 %21 to i64 | |
%23 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %22 | |
%24 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %23, i32 4) #8 | |
%25 = fadd float %19, %24 | |
%26 = or i32 %.012.i.i.i.us.i, 1 | |
%27 = mul nsw i32 %26, %14 | |
%28 = add nsw i32 %27, %17 | |
%29 = sext i32 %28 to i64 | |
%30 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %29 | |
%31 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %30, i32 4) #8 | |
%32 = fadd float %25, %31 | |
%33 = or i32 %.012.i.i.i.us.i, 2 | |
%34 = mul nsw i32 %33, %14 | |
%35 = add nsw i32 %34, %17 | |
%36 = sext i32 %35 to i64 | |
%37 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %36 | |
%38 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %37, i32 4) #8 | |
%39 = fadd float %32, %38 | |
%40 = or i32 %.012.i.i.i.us.i, 3 | |
%41 = mul nsw i32 %40, %14 | |
%42 = add nsw i32 %41, %17 | |
%43 = sext i32 %42 to i64 | |
%44 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %43 | |
%45 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %44, i32 4) #8 | |
%46 = fadd float %39, %45 | |
%47 = add nsw i32 %.012.i.i.i.us.i, 4 | |
%niter.nsub.3 = add i32 %niter, -4 | |
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 | |
br i1 %niter.ncmp.3, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, label %18 | |
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit: ; preds = %18 | |
%.lcssa67 = phi i32 [ %47, %18 ] | |
%.lcssa66 = phi float [ %46, %18 ] | |
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa | |
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit, %.lr.ph.split.us.i | |
%.lcssa.ph = phi float [ undef, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ] | |
%.unr = phi float [ 0.000000e+00, %.lr.ph.split.us.i ], [ %.lcssa66, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ] | |
%.012.i.i.i.us.i.unr = phi i32 [ 0, %.lr.ph.split.us.i ], [ %.lcssa67, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa.loopexit ] | |
br i1 %lcmp.mod, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i, label %.epil.preheader | |
.epil.preheader: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa | |
br label %48 | |
; <label>:48: ; preds = %48, %.epil.preheader | |
%49 = phi float [ %.unr, %.epil.preheader ], [ %55, %48 ] | |
%.012.i.i.i.us.i.epil = phi i32 [ %.012.i.i.i.us.i.unr, %.epil.preheader ], [ %56, %48 ] | |
%epil.iter = phi i32 [ %xtraiter, %.epil.preheader ], [ %epil.iter.sub, %48 ] | |
%50 = mul nsw i32 %.012.i.i.i.us.i.epil, %14 | |
%51 = add nsw i32 %50, %17 | |
%52 = sext i32 %51 to i64 | |
%53 = getelementptr inbounds float, float* %.sroa.8.0.copyload, i64 %52 | |
%54 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %53, i32 4) #8 | |
%55 = fadd float %49, %54 | |
%56 = add nuw nsw i32 %.012.i.i.i.us.i.epil, 1 | |
%epil.iter.sub = add i32 %epil.iter, -1 | |
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 | |
br i1 %epil.iter.cmp, label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa, label %48, !llvm.loop !72 | |
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa: ; preds = %48 | |
%.lcssa68 = phi float [ %55, %48 ] | |
br label %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i | |
_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa | |
%.lcssa = phi float [ %.lcssa.ph, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.unr-lcssa ], [ %.lcssa68, %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i.epilog-lcssa ] | |
%57 = sext i32 %.07.us.i to i64 | |
%58 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %57 | |
store float %.lcssa, float* %58, align 4 | |
%59 = add nsw i32 %.07.us.i, %9 | |
%60 = icmp slt i32 %59, %1 | |
br i1 %60, label %.lr.ph.split.us.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit | |
.lr.ph.split.i: ; preds = %.lr.ph.split.i.preheader, %.lr.ph.split.i | |
%.07.i = phi i32 [ %63, %.lr.ph.split.i ], [ %7, %.lr.ph.split.i.preheader ] | |
%61 = sext i32 %.07.i to i64 | |
%62 = getelementptr inbounds float, float* %.sroa.041.0.copyload, i64 %61 | |
store float 0.000000e+00, float* %62, align 4 | |
%63 = add nsw i32 %.07.i, %9 | |
%64 = icmp slt i32 %63, %1 | |
br i1 %64, label %.lr.ph.split.i, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64 | |
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit: ; preds = %_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS_8internal10SumReducerIfEEKNS_5arrayIiLm1EEEKNS2_INS3_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEE10evalScalarEi.exit.loopexit.us.i | |
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit | |
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64: ; preds = %.lr.ph.split.i | |
br label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit | |
_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit: ; preds = %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit64, %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiLb0EE3runESM_iii.exit.loopexit, %2 | |
ret void | |
} | |
; Function Attrs: nounwind | |
; InitVector kernel (Eigen internal, instantiated for a rank-1 float tensor on
; GpuDevice): stores the scalar %0 into elements [global_tid, %1) of the output
; buffer, striding by the total thread count (grid-wide strided store loop).
; Args: %0 = float value to write, %1 = i32 element count,
;       %2 = TensorEvaluator.16 passed byval; only its first field (the float*
;       data pointer, loaded at %10/%11) is read here.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_(float, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #0 comdat { | |
; %8 = blockIdx.x * blockDim.x + threadIdx.x (first element this thread owns)
%4 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%5 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%6 = mul nuw nsw i32 %5, %4 | |
%7 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%8 = add nuw nsw i32 %6, %7 | |
; Skip the loop entirely when this thread's first index is already >= %1.
%9 = icmp slt i32 %8, %1 | |
br i1 %9, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %3 | |
; Load the destination data pointer from the byval evaluator struct.
%10 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %2, i64 0, i32 0 | |
%11 = load float*, float** %10, align 8 | |
; %13 = loop stride = gridDim.x * blockDim.x (total threads in the launch).
%12 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%13 = mul nuw nsw i32 %12, %5 | |
br label %14 | |
._crit_edge.loopexit: ; preds = %14 | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %3 | |
ret void | |
; Store loop: buffer[%.08] = %0, then advance by the grid-wide stride.
; <label>:14: ; preds = %.lr.ph, %14 | |
%.08 = phi i32 [ %8, %.lr.ph ], [ %17, %14 ] | |
%15 = sext i32 %.08 to i64 | |
%16 = getelementptr inbounds float, float* %11, i64 %15 | |
store float %0, float* %16, align 4 | |
%17 = add i32 %13, %.08 | |
%18 = icmp slt i32 %17, %1 | |
br i1 %18, label %14, label %._crit_edge.loopexit | |
} | |
; Function Attrs: convergent nounwind | |
; RowReduceKernel<32, 256, 128, ..., CudaSumReducer> (template constants per
; the Li32/Li256/Li128 arguments in the mangled name): sums float rows of a
; rank-2 input into a rank-1 output via ldg loads, a shfl.down warp reduction,
; and a final atomic add.
; Args: %0 = CudaSumReducer (byval, empty struct), %1 = input evaluator
;       (float* data at field 0, loaded at %39/%40), %2 = i32 bounding the row
;       index (%43 is checked slt %2), %3 = i32 bounding the column offset
;       (all column indices are checked slt %3), %4 = output evaluator
;       (float* data at field 0, loaded at %37/%38).
; NOTE(review): %2/%3 are presumably num_rows/row_length of the reduced
; expression — inferred from the bounds checks; confirm against the caller.
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat { | |
; --- Launch-shape sanity checks: the kernel requires blockDim == (256,1,1)
; --- and gridDim == (32,1,1); any mismatch calls __assert_fail (noreturn).
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 32 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
; %31 = ceil(%3 / 32768) column blocks per row (32768 = 128 * 256, the
; per-thread-count times blockDim.x); %32 = %31 * %2 total work items.
%30 = add nsw i32 %3, 32767 | |
%31 = sdiv i32 %30, 32768 | |
%32 = mul nsw i32 %31, %2 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = icmp slt i32 %33, %32 | |
br i1 %35, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
; %36 = lane id (tid & 31); %.not is true for every lane except lane 0 and
; later gates the atomic add. Load output (%38) and input (%40) data pointers
; from the byval evaluator structs.
%36 = and i32 %34, 31 | |
%.not = icmp ne i32 %36, 0 | |
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0 | |
%38 = load float*, float** %37, align 8 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0 | |
%40 = load float*, float** %39, align 8 | |
br label %41 | |
._crit_edge.loopexit: ; preds = %187 | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %29 | |
ret void | |
; --- Outer loop: %.0114 is a virtual block index starting at blockIdx.x and
; --- advancing by 32 (the asserted gridDim.x) until it reaches %32.
; <label>:41: ; preds = %.lr.ph, %187 | |
%.0114 = phi i32 [ %33, %.lr.ph ], [ %188, %187 ] | |
; %42 = column-block index, %43 = row index; %44 = %42 * 32768 (shl 15);
; %45 = this thread's base column within the row (%44 + threadIdx.x).
%42 = srem i32 %.0114, %31 | |
%43 = sdiv i32 %.0114, %31 | |
%44 = shl nsw i32 %42, 15 | |
%45 = or i32 %44, %34 | |
%46 = icmp slt i32 %43, %2 | |
br i1 %46, label %.preheader102, label %.thread.preheader | |
.thread.preheader.loopexit: ; preds = %.preheader.preheader | |
%.lcssa137 = phi float [ %164, %.preheader.preheader ] | |
br label %.thread.preheader | |
; %.8112.ph merges the partial sum from every possible accumulation exit
; (fast path, guarded tail steps, or 0.0 when the row index was out of range).
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %190, %200, %210, %220, %230, %240, %250, %260, %270, %280, %290, %300, %310, %320, %41 | |
%.8112.ph = phi float [ 0.000000e+00, %41 ], [ %325, %320 ], [ %315, %310 ], [ %305, %300 ], [ %295, %290 ], [ %285, %280 ], [ %275, %270 ], [ %265, %260 ], [ %255, %250 ], [ %245, %240 ], [ %235, %230 ], [ %225, %220 ], [ %215, %210 ], [ %205, %200 ], [ %195, %190 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ] | |
br label %.thread | |
.preheader102: ; preds = %41 | |
; %47 = row * %3 (row base in the flattened input); %48 = linear index of
; this thread's first element.
%47 = mul nsw i32 %43, %3 | |
%48 = add i32 %47, %45 | |
br label %49 | |
; --- Accumulation loop: 8 outer iterations (%.098108 steps 0,16,...,112).
; --- %53 tests whether the furthest of the next 16 strided loads (offset
; --- %.098108*256 + 3840) is still inside the row; if so take the fully
; --- unrolled fast path, else fall through to the guarded tail.
; <label>:49: ; preds = %.preheader102, %.preheader.preheader | |
%.095109 = phi float [ 0.000000e+00, %.preheader102 ], [ %164, %.preheader.preheader ] | |
%.098108 = phi i32 [ 0, %.preheader102 ], [ %50, %.preheader.preheader ] | |
%50 = add nuw nsw i32 %.098108, 16 | |
%51 = shl i32 %.098108, 8 | |
%52 = or i32 %51, 3840 | |
%53 = add nsw i32 %52, %45 | |
%54 = icmp slt i32 %53, %3 | |
br i1 %54, label %.preheader.preheader, label %.preheader101 | |
; --- Fast path: 16 unchecked ldg loads at column offsets 0,256,...,3840
; --- past %48, chained into the running sum with fadd.
.preheader.preheader: ; preds = %49 | |
%55 = add i32 %48, %51 | |
%56 = sext i32 %55 to i64 | |
%57 = getelementptr inbounds float, float* %40, i64 %56 | |
%58 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %57, i32 4) #8 | |
%59 = fadd float %.095109, %58 | |
%60 = shl i32 %.098108, 8 | |
%61 = or i32 %60, 256 | |
%62 = add i32 %48, %61 | |
%63 = sext i32 %62 to i64 | |
%64 = getelementptr inbounds float, float* %40, i64 %63 | |
%65 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %64, i32 4) #8 | |
%66 = fadd float %59, %65 | |
%67 = shl i32 %.098108, 8 | |
%68 = or i32 %67, 512 | |
%69 = add i32 %48, %68 | |
%70 = sext i32 %69 to i64 | |
%71 = getelementptr inbounds float, float* %40, i64 %70 | |
%72 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %71, i32 4) #8 | |
%73 = fadd float %66, %72 | |
%74 = shl i32 %.098108, 8 | |
%75 = or i32 %74, 768 | |
%76 = add i32 %48, %75 | |
%77 = sext i32 %76 to i64 | |
%78 = getelementptr inbounds float, float* %40, i64 %77 | |
%79 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %78, i32 4) #8 | |
%80 = fadd float %73, %79 | |
%81 = shl i32 %.098108, 8 | |
%82 = or i32 %81, 1024 | |
%83 = add i32 %48, %82 | |
%84 = sext i32 %83 to i64 | |
%85 = getelementptr inbounds float, float* %40, i64 %84 | |
%86 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %85, i32 4) #8 | |
%87 = fadd float %80, %86 | |
%88 = shl i32 %.098108, 8 | |
%89 = or i32 %88, 1280 | |
%90 = add i32 %48, %89 | |
%91 = sext i32 %90 to i64 | |
%92 = getelementptr inbounds float, float* %40, i64 %91 | |
%93 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %92, i32 4) #8 | |
%94 = fadd float %87, %93 | |
%95 = shl i32 %.098108, 8 | |
%96 = or i32 %95, 1536 | |
%97 = add i32 %48, %96 | |
%98 = sext i32 %97 to i64 | |
%99 = getelementptr inbounds float, float* %40, i64 %98 | |
%100 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %99, i32 4) #8 | |
%101 = fadd float %94, %100 | |
%102 = shl i32 %.098108, 8 | |
%103 = or i32 %102, 1792 | |
%104 = add i32 %48, %103 | |
%105 = sext i32 %104 to i64 | |
%106 = getelementptr inbounds float, float* %40, i64 %105 | |
%107 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %106, i32 4) #8 | |
%108 = fadd float %101, %107 | |
%109 = shl i32 %.098108, 8 | |
%110 = or i32 %109, 2048 | |
%111 = add i32 %48, %110 | |
%112 = sext i32 %111 to i64 | |
%113 = getelementptr inbounds float, float* %40, i64 %112 | |
%114 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %113, i32 4) #8 | |
%115 = fadd float %108, %114 | |
%116 = shl i32 %.098108, 8 | |
%117 = or i32 %116, 2304 | |
%118 = add i32 %48, %117 | |
%119 = sext i32 %118 to i64 | |
%120 = getelementptr inbounds float, float* %40, i64 %119 | |
%121 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %120, i32 4) #8 | |
%122 = fadd float %115, %121 | |
%123 = shl i32 %.098108, 8 | |
%124 = or i32 %123, 2560 | |
%125 = add i32 %48, %124 | |
%126 = sext i32 %125 to i64 | |
%127 = getelementptr inbounds float, float* %40, i64 %126 | |
%128 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %127, i32 4) #8 | |
%129 = fadd float %122, %128 | |
%130 = shl i32 %.098108, 8 | |
%131 = or i32 %130, 2816 | |
%132 = add i32 %48, %131 | |
%133 = sext i32 %132 to i64 | |
%134 = getelementptr inbounds float, float* %40, i64 %133 | |
%135 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %134, i32 4) #8 | |
%136 = fadd float %129, %135 | |
%137 = shl i32 %.098108, 8 | |
%138 = or i32 %137, 3072 | |
%139 = add i32 %48, %138 | |
%140 = sext i32 %139 to i64 | |
%141 = getelementptr inbounds float, float* %40, i64 %140 | |
%142 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %141, i32 4) #8 | |
%143 = fadd float %136, %142 | |
%144 = shl i32 %.098108, 8 | |
%145 = or i32 %144, 3328 | |
%146 = add i32 %48, %145 | |
%147 = sext i32 %146 to i64 | |
%148 = getelementptr inbounds float, float* %40, i64 %147 | |
%149 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %148, i32 4) #8 | |
%150 = fadd float %143, %149 | |
%151 = shl i32 %.098108, 8 | |
%152 = or i32 %151, 3584 | |
%153 = add i32 %48, %152 | |
%154 = sext i32 %153 to i64 | |
%155 = getelementptr inbounds float, float* %40, i64 %154 | |
%156 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %155, i32 4) #8 | |
%157 = fadd float %150, %156 | |
%158 = shl i32 %.098108, 8 | |
%159 = or i32 %158, 3840 | |
%160 = add i32 %48, %159 | |
%161 = sext i32 %160 to i64 | |
%162 = getelementptr inbounds float, float* %40, i64 %161 | |
%163 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %162, i32 4) #8 | |
%164 = fadd float %157, %163 | |
; Continue while %50 < 128 (8 iterations of 16 loads = 128 values/thread).
%165 = icmp slt i32 %50, 128 | |
br i1 %165, label %49, label %.thread.preheader.loopexit | |
; --- Guarded tail: replay the same 16 strided loads, but each one is
; --- individually bounds-checked (column < %3) and bails out to the warp
; --- reduction as soon as a check fails (blocks %168..%320 below).
.preheader101: ; preds = %49 | |
%.lcssa = phi i32 [ %51, %49 ] | |
%.098108.lcssa = phi i32 [ %.098108, %49 ] | |
%.095109.lcssa = phi float [ %.095109, %49 ] | |
%166 = add nsw i32 %.lcssa, %45 | |
%167 = icmp slt i32 %166, %3 | |
br i1 %167, label %168, label %.thread.preheader | |
; <label>:168: ; preds = %.preheader101 | |
%169 = add nsw i32 %166, %47 | |
%170 = sext i32 %169 to i64 | |
%171 = getelementptr inbounds float, float* %40, i64 %170 | |
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8 | |
%173 = fadd float %.095109.lcssa, %172 | |
%174 = shl i32 %.098108.lcssa, 8 | |
%175 = or i32 %174, 256 | |
%176 = add nsw i32 %175, %45 | |
%177 = icmp slt i32 %176, %3 | |
br i1 %177, label %190, label %.thread.preheader | |
; --- Post-shuffle: %.lcssa138 is the warp-reduced sum. brmerge skips the
; --- atomic when this is not lane 0 (%.not) or the row was invalid (%.not99).
; <label>:178: ; preds = %.thread | |
%.lcssa138 = phi float [ %180, %.thread ] | |
%.not99 = xor i1 %46, true | |
%brmerge = or i1 %.not, %.not99 | |
br i1 %brmerge, label %187, label %183 | |
; --- Warp reduction: shfl.down.b32 with offsets 16,8,4,2,1 (%.092113 starts
; --- at 16 and is halved until 0), folding lane sums into lane 0.
.thread: ; preds = %.thread.preheader, %.thread | |
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ] | |
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ] | |
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53 | |
%180 = fadd float %.8112, %179 | |
%181 = lshr i32 %.092113, 1 | |
%182 = icmp eq i32 %181, 0 | |
br i1 %182, label %178, label %.thread, !llvm.loop !73 | |
; Lane 0 with a valid row: atomically add the warp sum into output[row].
; <label>:183: ; preds = %178 | |
%184 = sext i32 %43 to i64 | |
%185 = getelementptr inbounds float, float* %38, i64 %184 | |
%186 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %185, float %.lcssa138) #8 | |
br label %187 | |
; Outer-loop latch: advance the virtual block index by gridDim.x (32).
; <label>:187: ; preds = %178, %183 | |
%188 = add nuw nsw i32 %.0114, 32 | |
%189 = icmp slt i32 %188, %32 | |
br i1 %189, label %41, label %._crit_edge.loopexit | |
; <label>:190: ; preds = %168 | |
%191 = add nsw i32 %176, %47 | |
%192 = sext i32 %191 to i64 | |
%193 = getelementptr inbounds float, float* %40, i64 %192 | |
%194 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %193, i32 4) #8 | |
%195 = fadd float %173, %194 | |
%196 = shl i32 %.098108.lcssa, 8 | |
%197 = or i32 %196, 512 | |
%198 = add nsw i32 %197, %45 | |
%199 = icmp slt i32 %198, %3 | |
br i1 %199, label %200, label %.thread.preheader | |
; <label>:200: ; preds = %190 | |
%201 = add nsw i32 %198, %47 | |
%202 = sext i32 %201 to i64 | |
%203 = getelementptr inbounds float, float* %40, i64 %202 | |
%204 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %203, i32 4) #8 | |
%205 = fadd float %195, %204 | |
%206 = shl i32 %.098108.lcssa, 8 | |
%207 = or i32 %206, 768 | |
%208 = add nsw i32 %207, %45 | |
%209 = icmp slt i32 %208, %3 | |
br i1 %209, label %210, label %.thread.preheader | |
; <label>:210: ; preds = %200 | |
%211 = add nsw i32 %208, %47 | |
%212 = sext i32 %211 to i64 | |
%213 = getelementptr inbounds float, float* %40, i64 %212 | |
%214 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %213, i32 4) #8 | |
%215 = fadd float %205, %214 | |
%216 = shl i32 %.098108.lcssa, 8 | |
%217 = or i32 %216, 1024 | |
%218 = add nsw i32 %217, %45 | |
%219 = icmp slt i32 %218, %3 | |
br i1 %219, label %220, label %.thread.preheader | |
; <label>:220: ; preds = %210 | |
%221 = add nsw i32 %218, %47 | |
%222 = sext i32 %221 to i64 | |
%223 = getelementptr inbounds float, float* %40, i64 %222 | |
%224 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %223, i32 4) #8 | |
%225 = fadd float %215, %224 | |
%226 = shl i32 %.098108.lcssa, 8 | |
%227 = or i32 %226, 1280 | |
%228 = add nsw i32 %227, %45 | |
%229 = icmp slt i32 %228, %3 | |
br i1 %229, label %230, label %.thread.preheader | |
; <label>:230: ; preds = %220 | |
%231 = add nsw i32 %228, %47 | |
%232 = sext i32 %231 to i64 | |
%233 = getelementptr inbounds float, float* %40, i64 %232 | |
%234 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %233, i32 4) #8 | |
%235 = fadd float %225, %234 | |
%236 = shl i32 %.098108.lcssa, 8 | |
%237 = or i32 %236, 1536 | |
%238 = add nsw i32 %237, %45 | |
%239 = icmp slt i32 %238, %3 | |
br i1 %239, label %240, label %.thread.preheader | |
; <label>:240: ; preds = %230 | |
%241 = add nsw i32 %238, %47 | |
%242 = sext i32 %241 to i64 | |
%243 = getelementptr inbounds float, float* %40, i64 %242 | |
%244 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %243, i32 4) #8 | |
%245 = fadd float %235, %244 | |
%246 = shl i32 %.098108.lcssa, 8 | |
%247 = or i32 %246, 1792 | |
%248 = add nsw i32 %247, %45 | |
%249 = icmp slt i32 %248, %3 | |
br i1 %249, label %250, label %.thread.preheader | |
; <label>:250: ; preds = %240 | |
%251 = add nsw i32 %248, %47 | |
%252 = sext i32 %251 to i64 | |
%253 = getelementptr inbounds float, float* %40, i64 %252 | |
%254 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %253, i32 4) #8 | |
%255 = fadd float %245, %254 | |
%256 = shl i32 %.098108.lcssa, 8 | |
%257 = or i32 %256, 2048 | |
%258 = add nsw i32 %257, %45 | |
%259 = icmp slt i32 %258, %3 | |
br i1 %259, label %260, label %.thread.preheader | |
; <label>:260: ; preds = %250 | |
%261 = add nsw i32 %258, %47 | |
%262 = sext i32 %261 to i64 | |
%263 = getelementptr inbounds float, float* %40, i64 %262 | |
%264 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %263, i32 4) #8 | |
%265 = fadd float %255, %264 | |
%266 = shl i32 %.098108.lcssa, 8 | |
%267 = or i32 %266, 2304 | |
%268 = add nsw i32 %267, %45 | |
%269 = icmp slt i32 %268, %3 | |
br i1 %269, label %270, label %.thread.preheader | |
; <label>:270: ; preds = %260 | |
%271 = add nsw i32 %268, %47 | |
%272 = sext i32 %271 to i64 | |
%273 = getelementptr inbounds float, float* %40, i64 %272 | |
%274 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %273, i32 4) #8 | |
%275 = fadd float %265, %274 | |
%276 = shl i32 %.098108.lcssa, 8 | |
%277 = or i32 %276, 2560 | |
%278 = add nsw i32 %277, %45 | |
%279 = icmp slt i32 %278, %3 | |
br i1 %279, label %280, label %.thread.preheader | |
; <label>:280: ; preds = %270 | |
%281 = add nsw i32 %278, %47 | |
%282 = sext i32 %281 to i64 | |
%283 = getelementptr inbounds float, float* %40, i64 %282 | |
%284 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %283, i32 4) #8 | |
%285 = fadd float %275, %284 | |
%286 = shl i32 %.098108.lcssa, 8 | |
%287 = or i32 %286, 2816 | |
%288 = add nsw i32 %287, %45 | |
%289 = icmp slt i32 %288, %3 | |
br i1 %289, label %290, label %.thread.preheader | |
; <label>:290: ; preds = %280 | |
%291 = add nsw i32 %288, %47 | |
%292 = sext i32 %291 to i64 | |
%293 = getelementptr inbounds float, float* %40, i64 %292 | |
%294 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %293, i32 4) #8 | |
%295 = fadd float %285, %294 | |
%296 = shl i32 %.098108.lcssa, 8 | |
%297 = or i32 %296, 3072 | |
%298 = add nsw i32 %297, %45 | |
%299 = icmp slt i32 %298, %3 | |
br i1 %299, label %300, label %.thread.preheader | |
; <label>:300: ; preds = %290 | |
%301 = add nsw i32 %298, %47 | |
%302 = sext i32 %301 to i64 | |
%303 = getelementptr inbounds float, float* %40, i64 %302 | |
%304 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %303, i32 4) #8 | |
%305 = fadd float %295, %304 | |
%306 = shl i32 %.098108.lcssa, 8 | |
%307 = or i32 %306, 3328 | |
%308 = add nsw i32 %307, %45 | |
%309 = icmp slt i32 %308, %3 | |
br i1 %309, label %310, label %.thread.preheader | |
; <label>:310: ; preds = %300 | |
%311 = add nsw i32 %308, %47 | |
%312 = sext i32 %311 to i64 | |
%313 = getelementptr inbounds float, float* %40, i64 %312 | |
%314 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %313, i32 4) #8 | |
%315 = fadd float %305, %314 | |
%316 = shl i32 %.098108.lcssa, 8 | |
%317 = or i32 %316, 3584 | |
%318 = add nsw i32 %317, %45 | |
%319 = icmp slt i32 %318, %3 | |
br i1 %319, label %320, label %.thread.preheader | |
; <label>:320: ; preds = %310 | |
%321 = add nsw i32 %318, %47 | |
%322 = sext i32 %321 to i64 | |
%323 = getelementptr inbounds float, float* %40, i64 %322 | |
%324 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %323, i32 4) #8 | |
%325 = fadd float %315, %324 | |
br label %.thread.preheader | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 133, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 134, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 135, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 32 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 137, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 138, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 139, i8* getelementptr inbounds ([437 x i8], [437 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %3, 32767 | |
%31 = sdiv i32 %30, 32768 | |
%32 = mul nsw i32 %31, %2 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = icmp slt i32 %33, %32 | |
br i1 %35, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%.idx = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0 | |
%36 = and i32 %34, 31 | |
%.not = icmp ne i32 %36, 0 | |
%37 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0 | |
%38 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0 | |
br label %39 | |
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit, %29 | |
ret void | |
; <label>:39: ; preds = %.lr.ph, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
%.0114 = phi i32 [ %33, %.lr.ph ], [ %196, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ] | |
%40 = srem i32 %.0114, %31 | |
%41 = sdiv i32 %.0114, %31 | |
%42 = shl nsw i32 %40, 15 | |
%43 = or i32 %42, %34 | |
%.idx.val = load float, float* %.idx, align 4 | |
%44 = icmp slt i32 %41, %2 | |
br i1 %44, label %.preheader102, label %.thread.preheader | |
.thread.preheader.loopexit: ; preds = %.preheader.preheader | |
%.lcssa137 = phi float [ %163, %.preheader.preheader ] | |
br label %.thread.preheader | |
.thread.preheader: ; preds = %.thread.preheader.loopexit, %.preheader101, %168, %198, %208, %218, %228, %238, %248, %258, %268, %278, %288, %298, %308, %318, %328, %39 | |
%.8112.ph = phi float [ %.idx.val, %39 ], [ %333, %328 ], [ %323, %318 ], [ %313, %308 ], [ %303, %298 ], [ %293, %288 ], [ %283, %278 ], [ %273, %268 ], [ %263, %258 ], [ %253, %248 ], [ %243, %238 ], [ %233, %228 ], [ %223, %218 ], [ %213, %208 ], [ %203, %198 ], [ %173, %168 ], [ %.095109.lcssa, %.preheader101 ], [ %.lcssa137, %.thread.preheader.loopexit ] | |
br label %.thread | |
.preheader102: ; preds = %39 | |
%45 = mul nsw i32 %41, %3 | |
%46 = add i32 %45, %43 | |
%47 = load float*, float** %38, align 8 | |
br label %48 | |
; <label>:48: ; preds = %.preheader102, %.preheader.preheader | |
%.095109 = phi float [ %.idx.val, %.preheader102 ], [ %163, %.preheader.preheader ] | |
%.098108 = phi i32 [ 0, %.preheader102 ], [ %49, %.preheader.preheader ] | |
%49 = add nuw nsw i32 %.098108, 16 | |
%50 = shl i32 %.098108, 8 | |
%51 = or i32 %50, 3840 | |
%52 = add nsw i32 %51, %43 | |
%53 = icmp slt i32 %52, %3 | |
br i1 %53, label %.preheader.preheader, label %.preheader101 | |
.preheader.preheader: ; preds = %48 | |
%54 = add i32 %46, %50 | |
%55 = sext i32 %54 to i64 | |
%56 = getelementptr inbounds float, float* %47, i64 %55 | |
%57 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %56, i32 4) #8 | |
%58 = tail call float @llvm.nvvm.fmax.f(float %.095109, float %57) #8 | |
%59 = shl i32 %.098108, 8 | |
%60 = or i32 %59, 256 | |
%61 = add i32 %46, %60 | |
%62 = sext i32 %61 to i64 | |
%63 = getelementptr inbounds float, float* %47, i64 %62 | |
%64 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %63, i32 4) #8 | |
%65 = tail call float @llvm.nvvm.fmax.f(float %58, float %64) #8 | |
%66 = shl i32 %.098108, 8 | |
%67 = or i32 %66, 512 | |
%68 = add i32 %46, %67 | |
%69 = sext i32 %68 to i64 | |
%70 = getelementptr inbounds float, float* %47, i64 %69 | |
%71 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %70, i32 4) #8 | |
%72 = tail call float @llvm.nvvm.fmax.f(float %65, float %71) #8 | |
%73 = shl i32 %.098108, 8 | |
%74 = or i32 %73, 768 | |
%75 = add i32 %46, %74 | |
%76 = sext i32 %75 to i64 | |
%77 = getelementptr inbounds float, float* %47, i64 %76 | |
%78 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %77, i32 4) #8 | |
%79 = tail call float @llvm.nvvm.fmax.f(float %72, float %78) #8 | |
%80 = shl i32 %.098108, 8 | |
%81 = or i32 %80, 1024 | |
%82 = add i32 %46, %81 | |
%83 = sext i32 %82 to i64 | |
%84 = getelementptr inbounds float, float* %47, i64 %83 | |
%85 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %84, i32 4) #8 | |
%86 = tail call float @llvm.nvvm.fmax.f(float %79, float %85) #8 | |
%87 = shl i32 %.098108, 8 | |
%88 = or i32 %87, 1280 | |
%89 = add i32 %46, %88 | |
%90 = sext i32 %89 to i64 | |
%91 = getelementptr inbounds float, float* %47, i64 %90 | |
%92 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %91, i32 4) #8 | |
%93 = tail call float @llvm.nvvm.fmax.f(float %86, float %92) #8 | |
%94 = shl i32 %.098108, 8 | |
%95 = or i32 %94, 1536 | |
%96 = add i32 %46, %95 | |
%97 = sext i32 %96 to i64 | |
%98 = getelementptr inbounds float, float* %47, i64 %97 | |
%99 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %98, i32 4) #8 | |
%100 = tail call float @llvm.nvvm.fmax.f(float %93, float %99) #8 | |
%101 = shl i32 %.098108, 8 | |
%102 = or i32 %101, 1792 | |
%103 = add i32 %46, %102 | |
%104 = sext i32 %103 to i64 | |
%105 = getelementptr inbounds float, float* %47, i64 %104 | |
%106 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %105, i32 4) #8 | |
%107 = tail call float @llvm.nvvm.fmax.f(float %100, float %106) #8 | |
%108 = shl i32 %.098108, 8 | |
%109 = or i32 %108, 2048 | |
%110 = add i32 %46, %109 | |
%111 = sext i32 %110 to i64 | |
%112 = getelementptr inbounds float, float* %47, i64 %111 | |
%113 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %112, i32 4) #8 | |
%114 = tail call float @llvm.nvvm.fmax.f(float %107, float %113) #8 | |
%115 = shl i32 %.098108, 8 | |
%116 = or i32 %115, 2304 | |
%117 = add i32 %46, %116 | |
%118 = sext i32 %117 to i64 | |
%119 = getelementptr inbounds float, float* %47, i64 %118 | |
%120 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %119, i32 4) #8 | |
%121 = tail call float @llvm.nvvm.fmax.f(float %114, float %120) #8 | |
%122 = shl i32 %.098108, 8 | |
%123 = or i32 %122, 2560 | |
%124 = add i32 %46, %123 | |
%125 = sext i32 %124 to i64 | |
%126 = getelementptr inbounds float, float* %47, i64 %125 | |
%127 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %126, i32 4) #8 | |
%128 = tail call float @llvm.nvvm.fmax.f(float %121, float %127) #8 | |
%129 = shl i32 %.098108, 8 | |
%130 = or i32 %129, 2816 | |
%131 = add i32 %46, %130 | |
%132 = sext i32 %131 to i64 | |
%133 = getelementptr inbounds float, float* %47, i64 %132 | |
%134 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %133, i32 4) #8 | |
%135 = tail call float @llvm.nvvm.fmax.f(float %128, float %134) #8 | |
%136 = shl i32 %.098108, 8 | |
%137 = or i32 %136, 3072 | |
%138 = add i32 %46, %137 | |
%139 = sext i32 %138 to i64 | |
%140 = getelementptr inbounds float, float* %47, i64 %139 | |
%141 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %140, i32 4) #8 | |
%142 = tail call float @llvm.nvvm.fmax.f(float %135, float %141) #8 | |
%143 = shl i32 %.098108, 8 | |
%144 = or i32 %143, 3328 | |
%145 = add i32 %46, %144 | |
%146 = sext i32 %145 to i64 | |
%147 = getelementptr inbounds float, float* %47, i64 %146 | |
%148 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %147, i32 4) #8 | |
%149 = tail call float @llvm.nvvm.fmax.f(float %142, float %148) #8 | |
%150 = shl i32 %.098108, 8 | |
%151 = or i32 %150, 3584 | |
%152 = add i32 %46, %151 | |
%153 = sext i32 %152 to i64 | |
%154 = getelementptr inbounds float, float* %47, i64 %153 | |
%155 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %154, i32 4) #8 | |
%156 = tail call float @llvm.nvvm.fmax.f(float %149, float %155) #8 | |
%157 = shl i32 %.098108, 8 | |
%158 = or i32 %157, 3840 | |
%159 = add i32 %46, %158 | |
%160 = sext i32 %159 to i64 | |
%161 = getelementptr inbounds float, float* %47, i64 %160 | |
%162 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %161, i32 4) #8 | |
%163 = tail call float @llvm.nvvm.fmax.f(float %156, float %162) #8 | |
%164 = icmp slt i32 %49, 128 | |
br i1 %164, label %48, label %.thread.preheader.loopexit | |
.preheader101: ; preds = %48 | |
%.lcssa = phi i32 [ %50, %48 ] | |
%.098108.lcssa = phi i32 [ %.098108, %48 ] | |
%.095109.lcssa = phi float [ %.095109, %48 ] | |
%165 = load float*, float** %38, align 8 | |
%166 = add nsw i32 %.lcssa, %43 | |
%167 = icmp slt i32 %166, %3 | |
br i1 %167, label %168, label %.thread.preheader | |
; <label>:168: ; preds = %.preheader101 | |
%169 = add nsw i32 %166, %45 | |
%170 = sext i32 %169 to i64 | |
%171 = getelementptr inbounds float, float* %165, i64 %170 | |
%172 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %171, i32 4) #8 | |
%173 = tail call float @llvm.nvvm.fmax.f(float %.095109.lcssa, float %172) #8 | |
%174 = shl i32 %.098108.lcssa, 8 | |
%175 = or i32 %174, 256 | |
%176 = add nsw i32 %175, %43 | |
%177 = icmp slt i32 %176, %3 | |
br i1 %177, label %198, label %.thread.preheader | |
; <label>:178: ; preds = %.thread | |
%.lcssa138 = phi float [ %180, %.thread ] | |
%.not99 = xor i1 %44, true | |
%brmerge = or i1 %.not, %.not99 | |
br i1 %brmerge, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %183 | |
.thread: ; preds = %.thread.preheader, %.thread | |
%.092113 = phi i32 [ %181, %.thread ], [ 16, %.thread.preheader ] | |
%.8112 = phi float [ %180, %.thread ], [ %.8112.ph, %.thread.preheader ] | |
%179 = tail call float asm sideeffect "shfl.down.b32 $0, $1, $2, $3;", "=f,f,r,r"(float %.8112, i32 %.092113, i32 31) #3, !srcloc !53 | |
%180 = tail call float @llvm.nvvm.fmax.f(float %179, float %.8112) #8 | |
%181 = lshr i32 %.092113, 1 | |
%182 = icmp eq i32 %181, 0 | |
br i1 %182, label %178, label %.thread, !llvm.loop !74 | |
; <label>:183: ; preds = %178 | |
%184 = sext i32 %41 to i64 | |
%185 = load float*, float** %37, align 8 | |
%186 = getelementptr inbounds float, float* %185, i64 %184 | |
%187 = bitcast float %.lcssa138 to i32 | |
%188 = bitcast float* %186 to i32* | |
%189 = load i32, i32* %188, align 4 | |
br label %190 | |
; <label>:190: ; preds = %193, %183 | |
%.011.i = phi i32 [ %189, %183 ], [ %195, %193 ] | |
%191 = bitcast i32 %.011.i to float | |
%192 = fcmp olt float %191, %.lcssa138 | |
br i1 %192, label %193, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit | |
; <label>:193: ; preds = %190 | |
%194 = cmpxchg i32* %188, i32 %.011.i, i32 %187 seq_cst seq_cst | |
%195 = extractvalue { i32, i1 } %194, 0 | |
%not..i = icmp eq i32 %.011.i, %195 | |
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, label %190 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit: ; preds = %190, %193 | |
br label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.loopexit, %178 | |
%196 = add nuw nsw i32 %.0114, 32 | |
%197 = icmp slt i32 %196, %32 | |
br i1 %197, label %39, label %._crit_edge.loopexit | |
; <label>:198: ; preds = %168 | |
%199 = add nsw i32 %176, %45 | |
%200 = sext i32 %199 to i64 | |
%201 = getelementptr inbounds float, float* %165, i64 %200 | |
%202 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %201, i32 4) #8 | |
%203 = tail call float @llvm.nvvm.fmax.f(float %173, float %202) #8 | |
%204 = shl i32 %.098108.lcssa, 8 | |
%205 = or i32 %204, 512 | |
%206 = add nsw i32 %205, %43 | |
%207 = icmp slt i32 %206, %3 | |
br i1 %207, label %208, label %.thread.preheader | |
; <label>:208: ; preds = %198 | |
%209 = add nsw i32 %206, %45 | |
%210 = sext i32 %209 to i64 | |
%211 = getelementptr inbounds float, float* %165, i64 %210 | |
%212 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %211, i32 4) #8 | |
%213 = tail call float @llvm.nvvm.fmax.f(float %203, float %212) #8 | |
%214 = shl i32 %.098108.lcssa, 8 | |
%215 = or i32 %214, 768 | |
%216 = add nsw i32 %215, %43 | |
%217 = icmp slt i32 %216, %3 | |
br i1 %217, label %218, label %.thread.preheader | |
; <label>:218: ; preds = %208 | |
%219 = add nsw i32 %216, %45 | |
%220 = sext i32 %219 to i64 | |
%221 = getelementptr inbounds float, float* %165, i64 %220 | |
%222 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %221, i32 4) #8 | |
%223 = tail call float @llvm.nvvm.fmax.f(float %213, float %222) #8 | |
%224 = shl i32 %.098108.lcssa, 8 | |
%225 = or i32 %224, 1024 | |
%226 = add nsw i32 %225, %43 | |
%227 = icmp slt i32 %226, %3 | |
br i1 %227, label %228, label %.thread.preheader | |
; <label>:228: ; preds = %218 | |
%229 = add nsw i32 %226, %45 | |
%230 = sext i32 %229 to i64 | |
%231 = getelementptr inbounds float, float* %165, i64 %230 | |
%232 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %231, i32 4) #8 | |
%233 = tail call float @llvm.nvvm.fmax.f(float %223, float %232) #8 | |
%234 = shl i32 %.098108.lcssa, 8 | |
%235 = or i32 %234, 1280 | |
%236 = add nsw i32 %235, %43 | |
%237 = icmp slt i32 %236, %3 | |
br i1 %237, label %238, label %.thread.preheader | |
; <label>:238: ; preds = %228 | |
%239 = add nsw i32 %236, %45 | |
%240 = sext i32 %239 to i64 | |
%241 = getelementptr inbounds float, float* %165, i64 %240 | |
%242 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %241, i32 4) #8 | |
%243 = tail call float @llvm.nvvm.fmax.f(float %233, float %242) #8 | |
%244 = shl i32 %.098108.lcssa, 8 | |
%245 = or i32 %244, 1536 | |
%246 = add nsw i32 %245, %43 | |
%247 = icmp slt i32 %246, %3 | |
br i1 %247, label %248, label %.thread.preheader | |
; <label>:248: ; preds = %238 | |
%249 = add nsw i32 %246, %45 | |
%250 = sext i32 %249 to i64 | |
%251 = getelementptr inbounds float, float* %165, i64 %250 | |
%252 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %251, i32 4) #8 | |
%253 = tail call float @llvm.nvvm.fmax.f(float %243, float %252) #8 | |
%254 = shl i32 %.098108.lcssa, 8 | |
%255 = or i32 %254, 1792 | |
%256 = add nsw i32 %255, %43 | |
%257 = icmp slt i32 %256, %3 | |
br i1 %257, label %258, label %.thread.preheader | |
; <label>:258: ; preds = %248 | |
%259 = add nsw i32 %256, %45 | |
%260 = sext i32 %259 to i64 | |
%261 = getelementptr inbounds float, float* %165, i64 %260 | |
%262 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %261, i32 4) #8 | |
%263 = tail call float @llvm.nvvm.fmax.f(float %253, float %262) #8 | |
%264 = shl i32 %.098108.lcssa, 8 | |
%265 = or i32 %264, 2048 | |
%266 = add nsw i32 %265, %43 | |
%267 = icmp slt i32 %266, %3 | |
br i1 %267, label %268, label %.thread.preheader | |
; <label>:268: ; preds = %258 | |
%269 = add nsw i32 %266, %45 | |
%270 = sext i32 %269 to i64 | |
%271 = getelementptr inbounds float, float* %165, i64 %270 | |
%272 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %271, i32 4) #8 | |
%273 = tail call float @llvm.nvvm.fmax.f(float %263, float %272) #8 | |
%274 = shl i32 %.098108.lcssa, 8 | |
%275 = or i32 %274, 2304 | |
%276 = add nsw i32 %275, %43 | |
%277 = icmp slt i32 %276, %3 | |
br i1 %277, label %278, label %.thread.preheader | |
; <label>:278: ; preds = %268 | |
%279 = add nsw i32 %276, %45 | |
%280 = sext i32 %279 to i64 | |
%281 = getelementptr inbounds float, float* %165, i64 %280 | |
%282 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %281, i32 4) #8 | |
%283 = tail call float @llvm.nvvm.fmax.f(float %273, float %282) #8 | |
%284 = shl i32 %.098108.lcssa, 8 | |
%285 = or i32 %284, 2560 | |
%286 = add nsw i32 %285, %43 | |
%287 = icmp slt i32 %286, %3 | |
br i1 %287, label %288, label %.thread.preheader | |
; <label>:288: ; preds = %278 | |
%289 = add nsw i32 %286, %45 | |
%290 = sext i32 %289 to i64 | |
%291 = getelementptr inbounds float, float* %165, i64 %290 | |
%292 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %291, i32 4) #8 | |
%293 = tail call float @llvm.nvvm.fmax.f(float %283, float %292) #8 | |
%294 = shl i32 %.098108.lcssa, 8 | |
%295 = or i32 %294, 2816 | |
%296 = add nsw i32 %295, %43 | |
%297 = icmp slt i32 %296, %3 | |
br i1 %297, label %298, label %.thread.preheader | |
; <label>:298: ; preds = %288 | |
%299 = add nsw i32 %296, %45 | |
%300 = sext i32 %299 to i64 | |
%301 = getelementptr inbounds float, float* %165, i64 %300 | |
%302 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %301, i32 4) #8 | |
%303 = tail call float @llvm.nvvm.fmax.f(float %293, float %302) #8 | |
%304 = shl i32 %.098108.lcssa, 8 | |
%305 = or i32 %304, 3072 | |
%306 = add nsw i32 %305, %43 | |
%307 = icmp slt i32 %306, %3 | |
br i1 %307, label %308, label %.thread.preheader | |
; <label>:308: ; preds = %298 | |
%309 = add nsw i32 %306, %45 | |
%310 = sext i32 %309 to i64 | |
%311 = getelementptr inbounds float, float* %165, i64 %310 | |
%312 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %311, i32 4) #8 | |
%313 = tail call float @llvm.nvvm.fmax.f(float %303, float %312) #8 | |
%314 = shl i32 %.098108.lcssa, 8 | |
%315 = or i32 %314, 3328 | |
%316 = add nsw i32 %315, %43 | |
%317 = icmp slt i32 %316, %3 | |
br i1 %317, label %318, label %.thread.preheader | |
; <label>:318: ; preds = %308 | |
%319 = add nsw i32 %316, %45 | |
%320 = sext i32 %319 to i64 | |
%321 = getelementptr inbounds float, float* %165, i64 %320 | |
%322 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %321, i32 4) #8 | |
%323 = tail call float @llvm.nvvm.fmax.f(float %313, float %322) #8 | |
%324 = shl i32 %.098108.lcssa, 8 | |
%325 = or i32 %324, 3584 | |
%326 = add nsw i32 %325, %43 | |
%327 = icmp slt i32 %326, %3 | |
br i1 %327, label %328, label %.thread.preheader | |
; <label>:328: ; preds = %318 | |
%329 = add nsw i32 %326, %45 | |
%330 = sext i32 %329 to i64 | |
%331 = getelementptr inbounds float, float* %165, i64 %330 | |
%332 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %331, i32 4) #8 | |
%333 = tail call float @llvm.nvvm.fmax.f(float %323, float %332) #8 | |
br label %.thread.preheader | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"* byval align 1, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 128 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %2, 15 | |
%31 = sdiv i32 %30, 16 | |
%32 = mul nsw i32 %31, %3 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = shl nuw nsw i32 %33, 8 | |
%36 = add nuw nsw i32 %35, %34 | |
%37 = icmp slt i32 %36, %32 | |
br i1 %37, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%38 = icmp sgt i32 %3, -1 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0 | |
%40 = load float*, float** %39, align 8 | |
%41 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0 | |
%42 = load float*, float** %41, align 8 | |
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader | |
.lr.ph.split.preheader: ; preds = %.lr.ph | |
%43 = add i32 %32, -1 | |
%44 = sub i32 %43, %34 | |
%45 = sub i32 %44, %35 | |
%46 = lshr i32 %45, 15 | |
%47 = add nuw nsw i32 %46, 1 | |
%xtraiter = and i32 %47, 3 | |
%48 = icmp ult i32 %45, 98304 | |
br i1 %48, label %._crit_edge.loopexit59.unr-lcssa, label %.lr.ph.split.preheader.new | |
.lr.ph.split.preheader.new: ; preds = %.lr.ph.split.preheader | |
%unroll_iter = sub nsw i32 %47, %xtraiter | |
br label %.lr.ph.split | |
.lr.ph.split.us.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split.us | |
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %.us-lcssa.us.us | |
%.047.us = phi i32 [ %71, %.us-lcssa.us.us ], [ %36, %.lr.ph.split.us.preheader ] | |
%49 = srem i32 %.047.us, %3 | |
%50 = sdiv i32 %.047.us, %3 | |
%51 = srem i32 %50, %31 | |
%52 = shl nsw i32 %51, 4 | |
br label %53 | |
; <label>:53: ; preds = %104, %.lr.ph.split.us | |
%.04346.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %107, %104 ] | |
%.04445.us.us = phi float [ 0.000000e+00, %.lr.ph.split.us ], [ %106, %104 ] | |
%54 = add nuw nsw i32 %.04346.us.us, %52 | |
%55 = icmp slt i32 %54, %2 | |
br i1 %55, label %56, label %62 | |
; <label>:56: ; preds = %53 | |
%57 = mul nsw i32 %54, %3 | |
%58 = add nsw i32 %57, %49 | |
%59 = sext i32 %58 to i64 | |
%60 = getelementptr inbounds float, float* %40, i64 %59 | |
%61 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %60, i32 4) #8 | |
br label %62 | |
; <label>:62: ; preds = %56, %53 | |
%63 = phi float [ %61, %56 ], [ 0.000000e+00, %53 ] | |
%64 = fadd float %.04445.us.us, %63 | |
%65 = or i32 %.04346.us.us, 1 | |
%66 = add nuw nsw i32 %65, %52 | |
%67 = icmp slt i32 %66, %2 | |
br i1 %67, label %98, label %104 | |
.us-lcssa.us.us: ; preds = %104 | |
%.lcssa = phi float [ %106, %104 ] | |
%68 = sext i32 %49 to i64 | |
%69 = getelementptr inbounds float, float* %42, i64 %68 | |
%70 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %69, float %.lcssa) #8 | |
%71 = add nuw nsw i32 %.047.us, 32768 | |
%72 = icmp slt i32 %71, %32 | |
br i1 %72, label %.lr.ph.split.us, label %._crit_edge.loopexit | |
._crit_edge.loopexit: ; preds = %.us-lcssa.us.us | |
br label %._crit_edge | |
._crit_edge.loopexit59.unr-lcssa.loopexit: ; preds = %.lr.ph.split | |
%.lcssa60 = phi i32 [ %97, %.lr.ph.split ] | |
br label %._crit_edge.loopexit59.unr-lcssa | |
._crit_edge.loopexit59.unr-lcssa: ; preds = %._crit_edge.loopexit59.unr-lcssa.loopexit, %.lr.ph.split.preheader | |
%.047.unr = phi i32 [ %36, %.lr.ph.split.preheader ], [ %.lcssa60, %._crit_edge.loopexit59.unr-lcssa.loopexit ] | |
%lcmp.mod = icmp eq i32 %xtraiter, 0 | |
br i1 %lcmp.mod, label %._crit_edge.loopexit59, label %.lr.ph.split.epil.preheader | |
.lr.ph.split.epil.preheader: ; preds = %._crit_edge.loopexit59.unr-lcssa | |
br label %.lr.ph.split.epil | |
.lr.ph.split.epil: ; preds = %.lr.ph.split.epil, %.lr.ph.split.epil.preheader | |
%.047.epil = phi i32 [ %77, %.lr.ph.split.epil ], [ %.047.unr, %.lr.ph.split.epil.preheader ] | |
%epil.iter = phi i32 [ %epil.iter.sub, %.lr.ph.split.epil ], [ %xtraiter, %.lr.ph.split.epil.preheader ] | |
%73 = srem i32 %.047.epil, %3 | |
%74 = sext i32 %73 to i64 | |
%75 = getelementptr inbounds float, float* %42, i64 %74 | |
%76 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %75, float 0.000000e+00) #8 | |
%77 = add nuw nsw i32 %.047.epil, 32768 | |
%epil.iter.sub = add i32 %epil.iter, -1 | |
%epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 | |
br i1 %epil.iter.cmp, label %._crit_edge.loopexit59.epilog-lcssa, label %.lr.ph.split.epil, !llvm.loop !75 | |
._crit_edge.loopexit59.epilog-lcssa: ; preds = %.lr.ph.split.epil | |
br label %._crit_edge.loopexit59 | |
._crit_edge.loopexit59: ; preds = %._crit_edge.loopexit59.unr-lcssa, %._crit_edge.loopexit59.epilog-lcssa | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit59, %._crit_edge.loopexit, %29 | |
ret void | |
.lr.ph.split: ; preds = %.lr.ph.split, %.lr.ph.split.preheader.new | |
%.047 = phi i32 [ %36, %.lr.ph.split.preheader.new ], [ %97, %.lr.ph.split ] | |
%niter = phi i32 [ %unroll_iter, %.lr.ph.split.preheader.new ], [ %niter.nsub.3, %.lr.ph.split ] | |
%78 = srem i32 %.047, %3 | |
%79 = sext i32 %78 to i64 | |
%80 = getelementptr inbounds float, float* %42, i64 %79 | |
%81 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %80, float 0.000000e+00) #8 | |
%82 = add nuw nsw i32 %.047, 32768 | |
%83 = srem i32 %82, %3 | |
%84 = sext i32 %83 to i64 | |
%85 = getelementptr inbounds float, float* %42, i64 %84 | |
%86 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %85, float 0.000000e+00) #8 | |
%87 = add nsw i32 %.047, 65536 | |
%88 = srem i32 %87, %3 | |
%89 = sext i32 %88 to i64 | |
%90 = getelementptr inbounds float, float* %42, i64 %89 | |
%91 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %90, float 0.000000e+00) #8 | |
%92 = add nsw i32 %.047, 98304 | |
%93 = srem i32 %92, %3 | |
%94 = sext i32 %93 to i64 | |
%95 = getelementptr inbounds float, float* %42, i64 %94 | |
%96 = tail call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %95, float 0.000000e+00) #8 | |
%97 = add nsw i32 %.047, 131072 | |
%niter.nsub.3 = add i32 %niter, -4 | |
%niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 | |
br i1 %niter.ncmp.3, label %._crit_edge.loopexit59.unr-lcssa.loopexit, label %.lr.ph.split | |
; <label>:98: ; preds = %62 | |
%99 = mul nsw i32 %66, %3 | |
%100 = add nsw i32 %99, %49 | |
%101 = sext i32 %100 to i64 | |
%102 = getelementptr inbounds float, float* %40, i64 %101 | |
%103 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %102, i32 4) #8 | |
br label %104 | |
; <label>:104: ; preds = %98, %62 | |
%105 = phi float [ %103, %98 ], [ 0.000000e+00, %62 ] | |
%106 = fadd float %64, %105 | |
%107 = add nsw i32 %.04346.us.us, 2 | |
%exitcond.1 = icmp eq i32 %107, 16 | |
br i1 %exitcond.1, label %.us-lcssa.us.us, label %53 | |
} | |
; Function Attrs: convergent nounwind | |
define weak_odr void @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_(%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* byval align 4, %"struct.Eigen::TensorEvaluator.13"* byval align 8, i32, i32, %"struct.Eigen::TensorEvaluator.16"* byval align 8) #2 comdat { | |
%6 = tail call i32 @llvm.ptx.read.ntid.x() #8, !range !47 | |
%7 = icmp eq i32 %6, 256 | |
br i1 %7, label %9, label %8 | |
; <label>:8: ; preds = %5 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str, i64 0, i64 0), i32 93, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:9: ; preds = %5 | |
%10 = tail call i32 @llvm.ptx.read.ntid.y() #8, !range !47 | |
%11 = icmp eq i32 %10, 1 | |
br i1 %11, label %13, label %12 | |
; <label>:12: ; preds = %9 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 94, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:13: ; preds = %9 | |
%14 = tail call i32 @llvm.ptx.read.ntid.z() #8, !range !57 | |
%15 = icmp eq i32 %14, 1 | |
br i1 %15, label %17, label %16 | |
; <label>:16: ; preds = %13 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 95, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:17: ; preds = %13 | |
%18 = tail call i32 @llvm.ptx.read.nctaid.x() #8, !range !49 | |
%19 = icmp eq i32 %18, 128 | |
br i1 %19, label %21, label %20 | |
; <label>:20: ; preds = %17 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.4, i64 0, i64 0), i32 97, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:21: ; preds = %17 | |
%22 = tail call i32 @llvm.ptx.read.nctaid.y() #8, !range !49 | |
%23 = icmp eq i32 %22, 1 | |
br i1 %23, label %25, label %24 | |
; <label>:24: ; preds = %21 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.5, i64 0, i64 0), i32 98, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:25: ; preds = %21 | |
%26 = tail call i32 @llvm.ptx.read.nctaid.z() #8, !range !49 | |
%27 = icmp eq i32 %26, 1 | |
br i1 %27, label %29, label %28 | |
; <label>:28: ; preds = %25 | |
tail call fastcc void @_ZL13__assert_failPKcS0_jS0_(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.6, i64 0, i64 0), i32 99, i8* getelementptr inbounds ([440 x i8], [440 x i8]* @__PRETTY_FUNCTION__._ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, i64 0, i64 0)) #9 | |
unreachable | |
; <label>:29: ; preds = %25 | |
%30 = add nsw i32 %2, 15 | |
%31 = sdiv i32 %30, 16 | |
%32 = mul nsw i32 %31, %3 | |
%33 = tail call i32 @llvm.ptx.read.ctaid.x() #8, !range !46 | |
%34 = tail call i32 @llvm.ptx.read.tid.x() #8, !range !48 | |
%35 = shl nuw nsw i32 %33, 8 | |
%36 = add nuw nsw i32 %35, %34 | |
%37 = icmp slt i32 %36, %32 | |
br i1 %37, label %.lr.ph, label %._crit_edge | |
.lr.ph: ; preds = %29 | |
%.idx45 = getelementptr %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer", %"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"* %0, i64 0, i32 0 | |
%38 = icmp sgt i32 %3, -1 | |
%39 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.13", %"struct.Eigen::TensorEvaluator.13"* %1, i64 0, i32 0 | |
%40 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.16", %"struct.Eigen::TensorEvaluator.16"* %4, i64 0, i32 0 | |
br i1 %38, label %.lr.ph.split.us.preheader, label %.lr.ph.split.preheader | |
.lr.ph.split.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split | |
.lr.ph.split.us.preheader: ; preds = %.lr.ph | |
br label %.lr.ph.split.us | |
.lr.ph.split.us: ; preds = %.lr.ph.split.us.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
%.048.us = phi i32 [ %52, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us ], [ %36, %.lr.ph.split.us.preheader ] | |
%41 = srem i32 %.048.us, %3 | |
%42 = sdiv i32 %.048.us, %3 | |
%43 = srem i32 %42, %31 | |
%44 = shl nsw i32 %43, 4 | |
%.idx45.val.us = load float, float* %.idx45, align 4 | |
%45 = load float*, float** %39, align 8 | |
br label %54 | |
; <label>:46: ; preds = %49, %.us-lcssa.us.us | |
%.011.i.us = phi i32 [ %74, %.us-lcssa.us.us ], [ %51, %49 ] | |
%47 = bitcast i32 %.011.i.us to float | |
%48 = fcmp olt float %47, %.lcssa | |
br i1 %48, label %49, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
; <label>:49: ; preds = %46 | |
%50 = cmpxchg i32* %73, i32 %.011.i.us, i32 %72 seq_cst seq_cst | |
%51 = extractvalue { i32, i1 } %50, 0 | |
%not..i.us = icmp eq i32 %.011.i.us, %51 | |
br i1 %not..i.us, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us, label %46 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us: ; preds = %49, %46 | |
%52 = add nuw nsw i32 %.048.us, 32768 | |
%53 = icmp slt i32 %52, %32 | |
br i1 %53, label %.lr.ph.split.us, label %._crit_edge.loopexit | |
; <label>:54: ; preds = %112, %.lr.ph.split.us | |
%.04347.us.us = phi i32 [ 0, %.lr.ph.split.us ], [ %115, %112 ] | |
%.04446.us.us = phi float [ %.idx45.val.us, %.lr.ph.split.us ], [ %114, %112 ] | |
%55 = add nuw nsw i32 %.04347.us.us, %44 | |
%56 = icmp slt i32 %55, %2 | |
br i1 %56, label %57, label %63 | |
; <label>:57: ; preds = %54 | |
%58 = mul nsw i32 %55, %3 | |
%59 = add nsw i32 %58, %41 | |
%60 = sext i32 %59 to i64 | |
%61 = getelementptr inbounds float, float* %45, i64 %60 | |
%62 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %61, i32 4) #8 | |
br label %63 | |
; <label>:63: ; preds = %54, %57 | |
%64 = phi float [ %62, %57 ], [ %.idx45.val.us, %54 ] | |
%65 = tail call float @llvm.nvvm.fmax.f(float %.04446.us.us, float %64) #8 | |
%66 = or i32 %.04347.us.us, 1 | |
%67 = add nuw nsw i32 %66, %44 | |
%68 = icmp slt i32 %67, %2 | |
br i1 %68, label %106, label %112 | |
.us-lcssa.us.us: ; preds = %112 | |
%.lcssa = phi float [ %114, %112 ] | |
%69 = sext i32 %41 to i64 | |
%70 = load float*, float** %40, align 8 | |
%71 = getelementptr inbounds float, float* %70, i64 %69 | |
%72 = bitcast float %.lcssa to i32 | |
%73 = bitcast float* %71 to i32* | |
%74 = load i32, i32* %73, align 4 | |
br label %46 | |
._crit_edge.loopexit: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit.us | |
br label %._crit_edge | |
._crit_edge.loopexit60: ; preds = %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
br label %._crit_edge | |
._crit_edge: ; preds = %._crit_edge.loopexit60, %._crit_edge.loopexit, %29 | |
ret void | |
.lr.ph.split: ; preds = %.lr.ph.split.preheader, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
%.048 = phi i32 [ %104, %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit ], [ %36, %.lr.ph.split.preheader ] | |
%.idx45.val = load float, float* %.idx45, align 4 | |
%75 = tail call float @llvm.nvvm.fmax.f(float %.idx45.val, float %.idx45.val) #8 | |
%76 = tail call float @llvm.nvvm.fmax.f(float %75, float %.idx45.val) #8 | |
%77 = tail call float @llvm.nvvm.fmax.f(float %76, float %.idx45.val) #8 | |
%78 = tail call float @llvm.nvvm.fmax.f(float %77, float %.idx45.val) #8 | |
%79 = tail call float @llvm.nvvm.fmax.f(float %78, float %.idx45.val) #8 | |
%80 = tail call float @llvm.nvvm.fmax.f(float %79, float %.idx45.val) #8 | |
%81 = tail call float @llvm.nvvm.fmax.f(float %80, float %.idx45.val) #8 | |
%82 = tail call float @llvm.nvvm.fmax.f(float %81, float %.idx45.val) #8 | |
%83 = tail call float @llvm.nvvm.fmax.f(float %82, float %.idx45.val) #8 | |
%84 = tail call float @llvm.nvvm.fmax.f(float %83, float %.idx45.val) #8 | |
%85 = tail call float @llvm.nvvm.fmax.f(float %84, float %.idx45.val) #8 | |
%86 = tail call float @llvm.nvvm.fmax.f(float %85, float %.idx45.val) #8 | |
%87 = tail call float @llvm.nvvm.fmax.f(float %86, float %.idx45.val) #8 | |
%88 = tail call float @llvm.nvvm.fmax.f(float %87, float %.idx45.val) #8 | |
%89 = tail call float @llvm.nvvm.fmax.f(float %88, float %.idx45.val) #8 | |
%90 = tail call float @llvm.nvvm.fmax.f(float %89, float %.idx45.val) #8 | |
%91 = srem i32 %.048, %3 | |
%92 = sext i32 %91 to i64 | |
%93 = load float*, float** %40, align 8 | |
%94 = getelementptr inbounds float, float* %93, i64 %92 | |
%95 = bitcast float %90 to i32 | |
%96 = bitcast float* %94 to i32* | |
%97 = load i32, i32* %96, align 4 | |
br label %98 | |
; <label>:98: ; preds = %101, %.lr.ph.split | |
%.011.i = phi i32 [ %97, %.lr.ph.split ], [ %103, %101 ] | |
%99 = bitcast i32 %.011.i to float | |
%100 = fcmp olt float %99, %90 | |
br i1 %100, label %101, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit | |
; <label>:101: ; preds = %98 | |
%102 = cmpxchg i32* %96, i32 %.011.i, i32 %95 seq_cst seq_cst | |
%103 = extractvalue { i32, i1 } %102, 0 | |
%not..i = icmp eq i32 %.011.i, %103 | |
br i1 %not..i, label %_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit, label %98 | |
_ZN5Eigen8internal12_GLOBAL__N_114CudaMaxReducer13atomic_reduceEPff.exit: ; preds = %98, %101 | |
%104 = add nuw nsw i32 %.048, 32768 | |
%105 = icmp slt i32 %104, %32 | |
br i1 %105, label %.lr.ph.split, label %._crit_edge.loopexit60 | |
; <label>:106: ; preds = %63 | |
%107 = mul nsw i32 %67, %3 | |
%108 = add nsw i32 %107, %41 | |
%109 = sext i32 %108 to i64 | |
%110 = getelementptr inbounds float, float* %45, i64 %109 | |
%111 = tail call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* %110, i32 4) #8 | |
br label %112 | |
; <label>:112: ; preds = %106, %63 | |
%113 = phi float [ %111, %106 ], [ %.idx45.val.us, %63 ] | |
%114 = tail call float @llvm.nvvm.fmax.f(float %65, float %113) #8 | |
%115 = add nsw i32 %.04347.us.us, 2 | |
%exitcond.1 = icmp eq i32 %115, 16 | |
br i1 %exitcond.1, label %.us-lcssa.us.us, label %54 | |
} | |
; Function Attrs: nounwind readnone | |
declare float @llvm.nvvm.fmax.f(float, float) #1 | |
attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { nounwind readnone } | |
attributes #2 = { convergent nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #3 = { convergent nounwind } | |
attributes #4 = { argmemonly nounwind readonly } | |
attributes #5 = { argmemonly nounwind } | |
attributes #6 = { convergent inlinehint noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #7 = { convergent noreturn "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx42" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #8 = { nounwind } | |
attributes #9 = { convergent } | |
attributes #10 = { convergent noreturn nounwind } | |
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !38, !40, !40, !40, !40, !41, !41, !40} | |
!llvm.module.flags = !{!42, !43} | |
!llvm.ident = !{!44} | |
!nvvm.internalize.after.link = !{} | |
!nvvmir.version = !{!45} | |
!0 = !{void (float, i32, float*)* @_ZN5Eigen8internal19ReductionInitKernelIfiEEvT_T0_PS2_, !"kernel", i32 1} | |
!1 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!2 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!3 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!4 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!5 = !{void (%"struct.Eigen::TensorEvaluator.5"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!6 = !{void (float, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS0_10PtrWrapperIfiEEEEvfiT_, !"kernel", i32 1} | |
!7 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!8 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!9 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!10 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!11 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!12 = !{void (%"struct.Eigen::TensorEvaluator.6"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!13 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!14 = !{void (%"struct.Eigen::TensorEvaluator.11"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi1EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!15 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1} | |
!16 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!17 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!18 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!19 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.3"*, i32, i32, %"struct.Eigen::TensorEvaluator.7"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi1EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi1EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!20 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, float*)* @_ZN5Eigen8internal19FullReductionKernelILi256ELi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT2_T1_T3_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!21 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20InnerReductionKernelILi128ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!22 = !{void (%"struct.Eigen::internal::SumReducer"*, %"struct.Eigen::TensorEvaluator.12"*, i32, i32, float*)* @_ZN5Eigen8internal20OuterReductionKernelILi16ENS_15TensorEvaluatorIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEENS_9GpuDeviceEEES5_iEEvT1_T0_T2_SK_PNSJ_15CoeffReturnTypeE, !"kernel", i32 1} | |
!23 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!24 = !{void (%"struct.Eigen::TensorEvaluator.14"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorEvalToOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!25 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!26 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!27 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!28 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::internal::PtrWrapper"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS0_10PtrWrapperIfiEENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!29 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!30 = !{void (%"struct.Eigen::TensorEvaluator.15"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_18TensorForcedEvalOpIKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!31 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1} | |
!32 = !{void (%"struct.Eigen::TensorEvaluator.24"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEEKNS_17TensorReductionOpINS0_10SumReducerIfEEKNS_5arrayIiLm1EEEKNS4_INS5_IfLi2ELi0EiEELi0EEEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024} | |
!33 = !{void (float, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_110InitVectorINS_15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi0EiEELi0EEENS_9GpuDeviceEEEEEvfiT_, !"kernel", i32 1} | |
!34 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!35 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_115RowReduceKernelILi32ELi256ELi128ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!36 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaSumReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaSumReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!37 = !{void (%"struct.Eigen::internal::(anonymous namespace)::CudaMaxReducer"*, %"struct.Eigen::TensorEvaluator.13"*, i32, i32, %"struct.Eigen::TensorEvaluator.16"*)* @_ZN5Eigen8internal12_GLOBAL__N_118ColumnReduceKernelILi128ELi256ELi16ENS_15TensorEvaluatorIKNS_9TensorMapINS_6TensorIfLi2ELi0EiEELi0EEENS_9GpuDeviceEEENS3_INS4_INS5_IfLi1ELi0EiEELi0EEES9_EENS1_14CudaMaxReducerEEEvT4_T2_iiT3_, !"kernel", i32 1} | |
!38 = !{null, !"align", i32 8} | |
!39 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} | |
!40 = !{null, !"align", i32 16} | |
!41 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} | |
!42 = !{i32 4, !"nvvm-reflect-ftz", i32 0} | |
!43 = !{i32 1, !"PIC Level", i32 2} | |
!44 = !{!"clang version google3-trunk (trunk r271374)"} | |
!45 = !{i32 1, i32 2} | |
!46 = !{i32 0, i32 65535} | |
!47 = !{i32 1, i32 1025} | |
!48 = !{i32 0, i32 1024} | |
!49 = !{i32 1, i32 65536} | |
!50 = distinct !{!50, !51} | |
!51 = !{!"llvm.loop.unroll.disable"} | |
!52 = distinct !{!52, !51} | |
!53 = !{i32 457534} | |
!54 = distinct !{!54, !55} | |
!55 = !{!"llvm.loop.unroll.enable"} | |
!56 = distinct !{!56, !51} | |
!57 = !{i32 1, i32 65} | |
!58 = distinct !{!58, !55} | |
!59 = distinct !{!59, !55} | |
!60 = distinct !{!60, !51} | |
!61 = distinct !{!61, !51} | |
!62 = distinct !{!62, !55} | |
!63 = distinct !{!63, !55} | |
!64 = distinct !{!64, !51} | |
!65 = distinct !{!65, !51} | |
!66 = distinct !{!66, !51} | |
!67 = distinct !{!67, !55} | |
!68 = distinct !{!68, !51} | |
!69 = distinct !{!69, !55} | |
!70 = distinct !{!70, !55} | |
!71 = distinct !{!71, !51} | |
!72 = distinct !{!72, !51} | |
!73 = distinct !{!73, !55} | |
!74 = distinct !{!74, !55} | |
!75 = distinct !{!75, !51} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment