Skip to content

Instantly share code, notes, and snippets.

@jdoerfert
Created June 13, 2019 22:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdoerfert/4376a251d98171326d625f2fb67b5259 to your computer and use it in GitHub Desktop.
; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda
; The line above looks like a clang-offload-bundler text-bundle start marker
; for the OpenMP device part of test.c; keep it byte-identical, since tooling
; may match on it (NOTE(review): confirm against the bundler in use).
; ModuleID = 'test.c'
; Name of the translation unit this module was generated from.
source_filename = "test.c"
; Data layout: little-endian ("e"); i64 aligned to 64 bits, i128 to 128 bits;
; 16-bit and 32-bit vectors aligned to their own size; native integer widths
; are 16, 32 and 64 bits. No "p" entry is given, so pointers use the LLVM
; default layout.
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
; NOTE(review): the vendor component is spelled "nvida" here and in the bundle
; marker above; the conventional spelling is "nvidia". Confirm whether this is
; intentional before changing either occurrence, as they must stay in sync.
target triple = "nvptx64-nvida-cuda"
; ---------------------------------------------------------------------------
; Named struct types used by the kernel in this module. Field roles quoted
; below are taken from the value names the kernel uses when it indexes these
; types with getelementptr; they are hints, not a substitute for the device
; runtime headers (TODO confirm against the runtime sources).
; ---------------------------------------------------------------------------

; Per-team context. Fields by index:
;   0      team descriptor
;   1      1024 task descriptors
;   2      1024 task-descriptor pointers
;   3      anonymous union (1024 x i16)
;   4      1024 x i32
;   5..8   four 1024 x i64 arrays
;   9      i64 (the kernel names its address "cnt")
;   10     8 trailing bytes (presumably padding)
%class.omptarget_nvptx_ThreadPrivateContext = type { %class.omptarget_nvptx_TeamDescr, [1024 x %class.omptarget_nvptx_TaskDescr], [1024 x %class.omptarget_nvptx_TaskDescr*], %union.anon, [1024 x i32], [1024 x i64], [1024 x i64], [1024 x i64], [1024 x i64], i64, [8 x i8] }
; Team descriptor: a task descriptor (the kernel names it "levelZeroTaskDescr"),
; a work descriptor, an i64, 8 bytes (presumably padding), 32 worker
; data-sharing slots and 1 master data-sharing slot.
%class.omptarget_nvptx_TeamDescr = type { %class.omptarget_nvptx_TaskDescr, %class.omptarget_nvptx_WorkDescr, i64, [8 x i8], [32 x %struct.__kmpc_data_sharing_worker_slot_static], [1 x %struct.__kmpc_data_sharing_master_slot_static] }
; Task descriptor: saved-loop items, task items, and a pointer to another task
; descriptor (the kernel names that field "prev").
%class.omptarget_nvptx_TaskDescr = type { %"struct.omptarget_nvptx_TaskDescr::SavedLoopDescr_items", %"struct.omptarget_nvptx_TaskDescr::TaskDescr_items", %class.omptarget_nvptx_TaskDescr* }
; Four i64 values followed by one i32; not accessed by the visible kernel code.
%"struct.omptarget_nvptx_TaskDescr::SavedLoopDescr_items" = type { i64, i64, i64, i64, i32 }
; Kernel value names for the indexed fields: 0 = "flags", 2 = "threadId",
; 3 = "runtimeChunkSize". Field 1 is not accessed by the visible kernel code.
%"struct.omptarget_nvptx_TaskDescr::TaskDescr_items" = type { i8, i8, i16, i64 }
; Work descriptor: wraps exactly one task descriptor.
%class.omptarget_nvptx_WorkDescr = type { %class.omptarget_nvptx_TaskDescr }
; Worker data-sharing slot with an inline 8192-byte buffer. Kernel value names
; for the header fields: 0 = "Next", 1 = "Prev", 2 = "PrevSlotStackPtr",
; 3 = "DataEnd" (set to the address one past the 8192-byte buffer).
%struct.__kmpc_data_sharing_worker_slot_static = type { %struct.__kmpc_data_sharing_slot*, %struct.__kmpc_data_sharing_slot*, i8*, i8*, [8192 x i8] }
; Generic slot: same four header fields, followed by a zero-length byte array.
%struct.__kmpc_data_sharing_slot = type { %struct.__kmpc_data_sharing_slot*, %struct.__kmpc_data_sharing_slot*, i8*, i8*, [0 x i8] }
; Master data-sharing slot: same header fields with an inline 256-byte buffer.
%struct.__kmpc_data_sharing_master_slot_static = type { %struct.__kmpc_data_sharing_slot*, %struct.__kmpc_data_sharing_slot*, i8*, i8*, [256 x i8] }
; Anonymous union lowered to its single representative member, 1024 x i16.
%union.anon = type { [1024 x i16] }
; Four 32-entry tables: slot pointers, two i8* tables, and an i32 table.
; The kernel fills tables 0 and 1 with one entry per worker slot.
%struct.DataSharingStateTy = type { [32 x %struct.__kmpc_data_sharing_slot*], [32 x i8*], [32 x i8*], [32 x i32] }
; Queue of 32 thread-private contexts: the contexts themselves, 32 context
; pointers, an i32 (the kernel names it "head" and bumps it atomically),
; 32 x i32 (polled and exchanged atomically by the kernel), another i32, and
; 8 trailing bytes (presumably padding).
%class.omptarget_nvptx_Queue = type { [32 x %class.omptarget_nvptx_ThreadPrivateContext], [32 x %class.omptarget_nvptx_ThreadPrivateContext*], i32, [32 x i32], i32, [8 x i8] }
; ---------------------------------------------------------------------------
; Module-level globals. Under the LLVM NVPTX backend, address space 3 is
; shared memory and address space 1 is global memory (see the NVPTX usage
; guide); the kernel reaches these through addrspacecast to generic pointers.
; ---------------------------------------------------------------------------

; Zero-initialized 32 x i32 scratch array in address space 3 with common
; linkage. Only referenced via @llvm.compiler.used in the visible code.
@__openmp_nvptx_data_transfer_temporary_storage = common addrspace(3) global [32 x i32] zeroinitializer
; Per-kernel execution-mode byte, constant 0.
; NOTE(review): presumably 0 selects SPMD mode (the kernel body carries inlined
; __kmpc_spmd_kernel_init labels) - confirm against the toolchain version.
@__omp_offloading_10_1012dd1_main_l7_exec_mode = weak constant i8 0
; The following address-space-3 globals are external: declared here, defined
; in another module.
; Pointer to the current thread-private context; the kernel stores it once
; from thread 0 and reloads it after the barrier.
@omptarget_nvptx_threadPrivateContext = external dso_local local_unnamed_addr addrspace(3) global %class.omptarget_nvptx_ThreadPrivateContext*, align 8
; Data-sharing tables populated by thread 0 in the kernel's init loop.
@DataSharingState = external dso_local local_unnamed_addr addrspace(3) global %struct.DataSharingStateTy, align 8
; 32 one-byte entries, indexed in the kernel by (thread id >> 5).
@parallelLevel = external dso_local local_unnamed_addr addrspace(3) global [32 x i8], align 1
; Set to 1 on kernel entry.
@execution_param = external dso_local local_unnamed_addr addrspace(3) global i32, align 4
; Set by thread 0 to (%smid urem 84), the index used into the state array below.
@usedSlotIdx = external dso_local local_unnamed_addr addrspace(3) global i32, align 4
; 84 queues in address space 1; the element count matches the "urem 84" the
; kernel applies to the %smid register before indexing.
@omptarget_nvptx_device_State = external dso_local addrspace(1) global [84 x %class.omptarget_nvptx_Queue], align 16
; Marks the exec-mode byte and the transfer scratch array as used so that the
; compiler does not drop them even without visible references.
@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_10_1012dd1_main_l7_exec_mode, i8* addrspacecast (i8 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
; Function Attrs: norecurse nounwind
define weak void @__omp_offloading_10_1012dd1_main_l7(i32* dereferenceable(4) %Count) local_unnamed_addr #0 {
entry:
store i32 1, i32* addrspacecast (i32 addrspace(3)* @execution_param to i32*), align 4, !tbaa !13
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5, !range !17
%cmp.i = icmp eq i32 %0, 0
br i1 %cmp.i, label %if.then19.i, label %if.else.i
if.else.i: ; preds = %entry
%and.i.i = and i32 %0, 31
%cmp6.i = icmp eq i32 %and.i.i, 0
br i1 %cmp6.i, label %if.then7.i, label %__kmpc_spmd_kernel_init.exit
if.then7.i: ; preds = %if.else.i
%div.i.i = lshr i32 %0, 5
%idxprom.i = zext i32 %div.i.i to i64
%arrayidx64.i = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* @parallelLevel, i64 0, i64 %idxprom.i
%arrayidx.i = addrspacecast i8 addrspace(3)* %arrayidx64.i to i8*
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !18
%cmp9.i = icmp ugt i32 %1, 1
%add11.i = select i1 %cmp9.i, i8 -127, i8 1
store i8 %add11.i, i8* %arrayidx.i, align 1, !tbaa !19
br label %__kmpc_spmd_kernel_init.exit
if.then19.i: ; preds = %entry
%2 = tail call i32 asm "mov.u32 $0, %smid;", "=r"() #6, !srcloc !20
%rem.i = urem i32 %2, 84
store i32 %rem.i, i32* addrspacecast (i32 addrspace(3)* @usedSlotIdx to i32*), align 4, !tbaa !13
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !18
%cmp3.i = icmp ugt i32 %3, 1
%add.i = select i1 %cmp3.i, i8 -127, i8 1
store i8 %add.i, i8* getelementptr inbounds ([32 x i8], [32 x i8]* addrspacecast ([32 x i8] addrspace(3)* @parallelLevel to [32 x i8]*), i64 0, i64 0), align 1, !tbaa !19
%idxprom20.i = zext i32 %rem.i to i64
%arrayidx2163.i = getelementptr inbounds [84 x %class.omptarget_nvptx_Queue], [84 x %class.omptarget_nvptx_Queue] addrspace(1)* @omptarget_nvptx_device_State, i64 0, i64 %idxprom20.i
%arrayidx21.i = addrspacecast %class.omptarget_nvptx_Queue addrspace(1)* %arrayidx2163.i to %class.omptarget_nvptx_Queue*
%head.i.i.i = getelementptr inbounds %class.omptarget_nvptx_Queue, %class.omptarget_nvptx_Queue* %arrayidx21.i, i64 0, i32 2
%4 = atomicrmw add i32* %head.i.i.i, i32 1 seq_cst
%rem.i.i = and i32 %4, 31
%div.i.i.i = lshr i32 %4, 5
%mul.i.i.i = shl nuw nsw i32 %div.i.i.i, 1
%idxprom.i3.i.i = zext i32 %rem.i.i to i64
%arrayidx.i4.i.i = getelementptr inbounds %class.omptarget_nvptx_Queue, %class.omptarget_nvptx_Queue* %arrayidx21.i, i64 0, i32 3, i64 %idxprom.i3.i.i
br label %while.cond.i.i
while.cond.i.i: ; preds = %while.cond.i.i, %if.then19.i
%5 = atomicrmw or i32* %arrayidx.i4.i.i, i32 0 seq_cst
%cmp.i.i.i = icmp eq i32 %5, %mul.i.i.i
br i1 %cmp.i.i.i, label %_ZN21omptarget_nvptx_QueueI36omptarget_nvptx_ThreadPrivateContextLj32EE7DequeueEv.exit.i, label %while.cond.i.i
_ZN21omptarget_nvptx_QueueI36omptarget_nvptx_ThreadPrivateContextLj32EE7DequeueEv.exit.i: ; preds = %while.cond.i.i
%arrayidx.i2.i.i = getelementptr inbounds %class.omptarget_nvptx_Queue, %class.omptarget_nvptx_Queue* %arrayidx21.i, i64 0, i32 1, i64 %idxprom.i3.i.i
%6 = bitcast %class.omptarget_nvptx_ThreadPrivateContext** %arrayidx.i2.i.i to i64*
%7 = atomicrmw or i64* %6, i64 0 seq_cst
%8 = inttoptr i64 %7 to %class.omptarget_nvptx_ThreadPrivateContext*
%cmp.i.i = icmp eq i64 %7, 0
%arrayidx.i4.i = getelementptr inbounds %class.omptarget_nvptx_Queue, %class.omptarget_nvptx_Queue* %arrayidx21.i, i64 0, i32 0, i64 %idxprom.i3.i.i
%element.0.i.i = select i1 %cmp.i.i, %class.omptarget_nvptx_ThreadPrivateContext* %arrayidx.i4.i, %class.omptarget_nvptx_ThreadPrivateContext* %8
%add.i.i.i = and i32 %mul.i.i.i, 33554430
%rem.i.i.i = or i32 %add.i.i.i, 1
%9 = atomicrmw xchg i32* %arrayidx.i4.i.i, i32 %rem.i.i.i seq_cst
store %class.omptarget_nvptx_ThreadPrivateContext* %element.0.i.i, %class.omptarget_nvptx_ThreadPrivateContext** addrspacecast (%class.omptarget_nvptx_ThreadPrivateContext* addrspace(3)* @omptarget_nvptx_threadPrivateContext to %class.omptarget_nvptx_ThreadPrivateContext**), align 8, !tbaa !21
%flags.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %element.0.i.i, i64 0, i32 0, i32 0, i32 1, i32 0
store i8 0, i8* %flags.i.i.i, align 8, !tbaa !23
%threadId.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %element.0.i.i, i64 0, i32 0, i32 0, i32 1, i32 2
store i16 0, i16* %threadId.i.i.i, align 2, !tbaa !30
%runtimeChunkSize.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %element.0.i.i, i64 0, i32 0, i32 0, i32 1, i32 3
store i64 1, i64* %runtimeChunkSize.i.i.i, align 8, !tbaa !31
br label %__kmpc_spmd_kernel_init.exit
__kmpc_spmd_kernel_init.exit: ; preds = %if.else.i, %if.then7.i, %_ZN21omptarget_nvptx_QueueI36omptarget_nvptx_ThreadPrivateContextLj32EE7DequeueEv.exit.i
tail call void asm sideeffect "bar.sync $0;", "r,~{memory}"(i32 0) #7, !srcloc !32
%10 = load %class.omptarget_nvptx_ThreadPrivateContext*, %class.omptarget_nvptx_ThreadPrivateContext** addrspacecast (%class.omptarget_nvptx_ThreadPrivateContext* addrspace(3)* @omptarget_nvptx_threadPrivateContext to %class.omptarget_nvptx_ThreadPrivateContext**), align 8, !tbaa !21
%idxprom.i1.i = zext i32 %0 to i64
%arrayidx.i2.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %10, i64 0, i32 1, i64 %idxprom.i1.i
%levelZeroTaskDescr.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %10, i64 0, i32 0, i32 0
%flags.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %10, i64 0, i32 1, i64 %idxprom.i1.i, i32 1, i32 0
store i8 48, i8* %flags.i.i, align 8, !tbaa !23
%conv.i.i = trunc i32 %0 to i16
%threadId.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %10, i64 0, i32 1, i64 %idxprom.i1.i, i32 1, i32 2
store i16 %conv.i.i, i16* %threadId.i.i, align 2, !tbaa !30
%runtimeChunkSize.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %10, i64 0, i32 1, i64 %idxprom.i1.i, i32 1, i32 3
store i64 1, i64* %runtimeChunkSize.i.i, align 8, !tbaa !31
%prev.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %10, i64 0, i32 1, i64 %idxprom.i1.i, i32 2
store %class.omptarget_nvptx_TaskDescr* %levelZeroTaskDescr.i.i, %class.omptarget_nvptx_TaskDescr** %prev.i.i, align 8, !tbaa !33
%11 = load %class.omptarget_nvptx_ThreadPrivateContext*, %class.omptarget_nvptx_ThreadPrivateContext** addrspacecast (%class.omptarget_nvptx_ThreadPrivateContext* addrspace(3)* @omptarget_nvptx_threadPrivateContext to %class.omptarget_nvptx_ThreadPrivateContext**), align 8, !tbaa !21
%arrayidx.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %11, i64 0, i32 2, i64 %idxprom.i1.i
store %class.omptarget_nvptx_TaskDescr* %arrayidx.i2.i, %class.omptarget_nvptx_TaskDescr** %arrayidx.i.i, align 8, !tbaa !21
br i1 %cmp.i, label %if.then.i, label %__kmpc_data_sharing_init_stack_spmd.exit
if.then.i: ; preds = %__kmpc_spmd_kernel_init.exit
%12 = load %class.omptarget_nvptx_ThreadPrivateContext*, %class.omptarget_nvptx_ThreadPrivateContext** addrspacecast (%class.omptarget_nvptx_ThreadPrivateContext* addrspace(3)* @omptarget_nvptx_threadPrivateContext to %class.omptarget_nvptx_ThreadPrivateContext**), align 8, !tbaa !21
br label %for.body.i.i
for.body.i.i: ; preds = %for.body.i.i, %if.then.i
%WID.013.i.i = phi i32 [ 0, %if.then.i ], [ %inc.i.i.3, %for.body.i.i ]
%idxprom.i.i.i = zext i32 %WID.013.i.i to i64
%arrayidx.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i
%add.ptr.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i, i32 4, i64 8192
%DataEnd.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i, i32 3
store i8* %add.ptr.i.i.i, i8** %DataEnd.i.i.i, align 8, !tbaa !34
%Next.i.i.i = getelementptr inbounds %struct.__kmpc_data_sharing_worker_slot_static, %struct.__kmpc_data_sharing_worker_slot_static* %arrayidx.i.i.i, i64 0, i32 0
store %struct.__kmpc_data_sharing_slot* null, %struct.__kmpc_data_sharing_slot** %Next.i.i.i, align 16, !tbaa !36
%Prev.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i, i32 1
store %struct.__kmpc_data_sharing_slot* null, %struct.__kmpc_data_sharing_slot** %Prev.i.i.i, align 8, !tbaa !37
%PrevSlotStackPtr.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i, i32 2
store i8* null, i8** %PrevSlotStackPtr.i.i.i, align 16, !tbaa !38
%arrayidx.i.i2 = getelementptr %struct.DataSharingStateTy, %struct.DataSharingStateTy* addrspacecast (%struct.DataSharingStateTy addrspace(3)* @DataSharingState to %struct.DataSharingStateTy*), i64 0, i32 0, i64 %idxprom.i.i.i
%13 = bitcast %struct.__kmpc_data_sharing_slot** %arrayidx.i.i2 to %struct.__kmpc_data_sharing_worker_slot_static**
store %struct.__kmpc_data_sharing_worker_slot_static* %arrayidx.i.i.i, %struct.__kmpc_data_sharing_worker_slot_static** %13, align 8, !tbaa !21
%14 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i, i32 4, i64 0
%arrayidx4.i.i = getelementptr %struct.DataSharingStateTy, %struct.DataSharingStateTy* addrspacecast (%struct.DataSharingStateTy addrspace(3)* @DataSharingState to %struct.DataSharingStateTy*), i64 0, i32 1, i64 %idxprom.i.i.i
store i8* %14, i8** %arrayidx4.i.i, align 8, !tbaa !21
%inc.i.i = or i32 %WID.013.i.i, 1
%idxprom.i.i.i.1 = zext i32 %inc.i.i to i64
%arrayidx.i.i.i.1 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.1
%add.ptr.i.i.i.1 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.1, i32 4, i64 8192
%DataEnd.i.i.i.1 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.1, i32 3
store i8* %add.ptr.i.i.i.1, i8** %DataEnd.i.i.i.1, align 8, !tbaa !34
%Next.i.i.i.1 = getelementptr inbounds %struct.__kmpc_data_sharing_worker_slot_static, %struct.__kmpc_data_sharing_worker_slot_static* %arrayidx.i.i.i.1, i64 0, i32 0
store %struct.__kmpc_data_sharing_slot* null, %struct.__kmpc_data_sharing_slot** %Next.i.i.i.1, align 16, !tbaa !36
%Prev.i.i.i.1 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.1, i32 1
store %struct.__kmpc_data_sharing_slot* null, %struct.__kmpc_data_sharing_slot** %Prev.i.i.i.1, align 8, !tbaa !37
%PrevSlotStackPtr.i.i.i.1 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.1, i32 2
store i8* null, i8** %PrevSlotStackPtr.i.i.i.1, align 16, !tbaa !38
%arrayidx.i.i2.1 = getelementptr %struct.DataSharingStateTy, %struct.DataSharingStateTy* addrspacecast (%struct.DataSharingStateTy addrspace(3)* @DataSharingState to %struct.DataSharingStateTy*), i64 0, i32 0, i64 %idxprom.i.i.i.1
%15 = bitcast %struct.__kmpc_data_sharing_slot** %arrayidx.i.i2.1 to %struct.__kmpc_data_sharing_worker_slot_static**
store %struct.__kmpc_data_sharing_worker_slot_static* %arrayidx.i.i.i.1, %struct.__kmpc_data_sharing_worker_slot_static** %15, align 8, !tbaa !21
%16 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.1, i32 4, i64 0
%arrayidx4.i.i.1 = getelementptr %struct.DataSharingStateTy, %struct.DataSharingStateTy* addrspacecast (%struct.DataSharingStateTy addrspace(3)* @DataSharingState to %struct.DataSharingStateTy*), i64 0, i32 1, i64 %idxprom.i.i.i.1
store i8* %16, i8** %arrayidx4.i.i.1, align 8, !tbaa !21
%inc.i.i.1 = or i32 %WID.013.i.i, 2
%idxprom.i.i.i.2 = zext i32 %inc.i.i.1 to i64
%arrayidx.i.i.i.2 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.2
%add.ptr.i.i.i.2 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.2, i32 4, i64 8192
%DataEnd.i.i.i.2 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.2, i32 3
store i8* %add.ptr.i.i.i.2, i8** %DataEnd.i.i.i.2, align 8, !tbaa !34
%Next.i.i.i.2 = getelementptr inbounds %struct.__kmpc_data_sharing_worker_slot_static, %struct.__kmpc_data_sharing_worker_slot_static* %arrayidx.i.i.i.2, i64 0, i32 0
store %struct.__kmpc_data_sharing_slot* null, %struct.__kmpc_data_sharing_slot** %Next.i.i.i.2, align 16, !tbaa !36
%Prev.i.i.i.2 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.2, i32 1
store %struct.__kmpc_data_sharing_slot* null, %struct.__kmpc_data_sharing_slot** %Prev.i.i.i.2, align 8, !tbaa !37
%PrevSlotStackPtr.i.i.i.2 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.2, i32 2
store i8* null, i8** %PrevSlotStackPtr.i.i.i.2, align 16, !tbaa !38
%arrayidx.i.i2.2 = getelementptr %struct.DataSharingStateTy, %struct.DataSharingStateTy* addrspacecast (%struct.DataSharingStateTy addrspace(3)* @DataSharingState to %struct.DataSharingStateTy*), i64 0, i32 0, i64 %idxprom.i.i.i.2
%17 = bitcast %struct.__kmpc_data_sharing_slot** %arrayidx.i.i2.2 to %struct.__kmpc_data_sharing_worker_slot_static**
store %struct.__kmpc_data_sharing_worker_slot_static* %arrayidx.i.i.i.2, %struct.__kmpc_data_sharing_worker_slot_static** %17, align 8, !tbaa !21
%18 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.2, i32 4, i64 0
%arrayidx4.i.i.2 = getelementptr %struct.DataSharingStateTy, %struct.DataSharingStateTy* addrspacecast (%struct.DataSharingStateTy addrspace(3)* @DataSharingState to %struct.DataSharingStateTy*), i64 0, i32 1, i64 %idxprom.i.i.i.2
store i8* %18, i8** %arrayidx4.i.i.2, align 8, !tbaa !21
%inc.i.i.2 = or i32 %WID.013.i.i, 3
%idxprom.i.i.i.3 = zext i32 %inc.i.i.2 to i64
%arrayidx.i.i.i.3 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.3
%add.ptr.i.i.i.3 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.3, i32 4, i64 8192
%DataEnd.i.i.i.3 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.3, i32 3
store i8* %add.ptr.i.i.i.3, i8** %DataEnd.i.i.i.3, align 8, !tbaa !34
%Next.i.i.i.3 = getelementptr inbounds %struct.__kmpc_data_sharing_worker_slot_static, %struct.__kmpc_data_sharing_worker_slot_static* %arrayidx.i.i.i.3, i64 0, i32 0
store %struct.__kmpc_data_sharing_slot* null, %struct.__kmpc_data_sharing_slot** %Next.i.i.i.3, align 16, !tbaa !36
%Prev.i.i.i.3 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.3, i32 1
store %struct.__kmpc_data_sharing_slot* null, %struct.__kmpc_data_sharing_slot** %Prev.i.i.i.3, align 8, !tbaa !37
%PrevSlotStackPtr.i.i.i.3 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.3, i32 2
store i8* null, i8** %PrevSlotStackPtr.i.i.i.3, align 16, !tbaa !38
%arrayidx.i.i2.3 = getelementptr %struct.DataSharingStateTy, %struct.DataSharingStateTy* addrspacecast (%struct.DataSharingStateTy addrspace(3)* @DataSharingState to %struct.DataSharingStateTy*), i64 0, i32 0, i64 %idxprom.i.i.i.3
%19 = bitcast %struct.__kmpc_data_sharing_slot** %arrayidx.i.i2.3 to %struct.__kmpc_data_sharing_worker_slot_static**
store %struct.__kmpc_data_sharing_worker_slot_static* %arrayidx.i.i.i.3, %struct.__kmpc_data_sharing_worker_slot_static** %19, align 8, !tbaa !21
%20 = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %12, i64 0, i32 0, i32 4, i64 %idxprom.i.i.i.3, i32 4, i64 0
%arrayidx4.i.i.3 = getelementptr %struct.DataSharingStateTy, %struct.DataSharingStateTy* addrspacecast (%struct.DataSharingStateTy addrspace(3)* @DataSharingState to %struct.DataSharingStateTy*), i64 0, i32 1, i64 %idxprom.i.i.i.3
store i8* %20, i8** %arrayidx4.i.i.3, align 8, !tbaa !21
%inc.i.i.3 = add nuw nsw i32 %WID.013.i.i, 4
%exitcond.i.i.3 = icmp eq i32 %inc.i.i.3, 32
br i1 %exitcond.i.i.3, label %__kmpc_data_sharing_init_stack_spmd.exit, label %for.body.i.i
__kmpc_data_sharing_init_stack_spmd.exit: ; preds = %for.body.i.i, %__kmpc_spmd_kernel_init.exit
tail call void @llvm.nvvm.membar.cta() #5
%div.i.i.i3 = lshr i32 %0, 5
%idxprom.i.i = zext i32 %div.i.i.i3 to i64
%arrayidx10.i.i = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* @parallelLevel, i64 0, i64 %idxprom.i.i
%arrayidx.i.i4 = addrspacecast i8 addrspace(3)* %arrayidx10.i.i to i8*
%21 = load i8, i8* %arrayidx.i.i4, align 1, !tbaa !19
%22 = and i8 %21, 126
%cmp.i.i5 = icmp eq i8 %22, 0
%cmp.i28.i.i.i = icmp eq i8 %21, -127
br i1 %cmp.i28.i.i.i, label %_Z21GetNumberOfOmpThreadsb.exit.i.i.i, label %_Z21GetNumberOfOmpThreadsb.exit.thread.i.i.i
_Z21GetNumberOfOmpThreadsb.exit.thread.i.i.i: ; preds = %__kmpc_data_sharing_init_stack_spmd.exit
%23 = load %class.omptarget_nvptx_ThreadPrivateContext*, %class.omptarget_nvptx_ThreadPrivateContext** addrspacecast (%class.omptarget_nvptx_ThreadPrivateContext* addrspace(3)* @omptarget_nvptx_threadPrivateContext to %class.omptarget_nvptx_ThreadPrivateContext**), align 8, !tbaa !21
%arrayidx.i2541.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %23, i64 0, i32 4, i64 %idxprom.i1.i
br label %if.then32.i.i.i
_Z21GetNumberOfOmpThreadsb.exit.i.i.i: ; preds = %__kmpc_data_sharing_init_stack_spmd.exit
%24 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !18
%cmp.i.i.i17 = icmp eq i32 %24, 1
%25 = load %class.omptarget_nvptx_ThreadPrivateContext*, %class.omptarget_nvptx_ThreadPrivateContext** addrspacecast (%class.omptarget_nvptx_ThreadPrivateContext* addrspace(3)* @omptarget_nvptx_threadPrivateContext to %class.omptarget_nvptx_ThreadPrivateContext**), align 8, !tbaa !21
%arrayidx.i25.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %25, i64 0, i32 4, i64 %idxprom.i1.i
br i1 %cmp.i.i.i17, label %if.then32.i.i.i, label %if.then83.i.i.i
if.then32.i.i.i: ; preds = %_Z21GetNumberOfOmpThreadsb.exit.i.i.i, %_Z21GetNumberOfOmpThreadsb.exit.thread.i.i.i
%arrayidx.i2544.i.i.i = phi i32* [ %arrayidx.i25.i.i.i, %_Z21GetNumberOfOmpThreadsb.exit.i.i.i ], [ %arrayidx.i2541.i.i.i, %_Z21GetNumberOfOmpThreadsb.exit.thread.i.i.i ]
%26 = phi %class.omptarget_nvptx_ThreadPrivateContext* [ %25, %_Z21GetNumberOfOmpThreadsb.exit.i.i.i ], [ %23, %_Z21GetNumberOfOmpThreadsb.exit.thread.i.i.i ]
store i32 33, i32* %arrayidx.i2544.i.i.i, align 4, !tbaa !39
%arrayidx.i23.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %26, i64 0, i32 6, i64 %idxprom.i1.i
store i64 999, i64* %arrayidx.i23.i.i.i, align 8, !tbaa !40
%.op = mul nuw nsw
%arrayidx.i18.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %26, i64 0, i32 7, i64 %idxprom.i1.i
store i64 %conv38.i.i.i, i64* %arrayidx.i18.i.i.i, align 8, !tbaa !40
%arrayidx.i16.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %26, i64 0, i32 8, i64 %idxprom.i1.i
store i64 1000, i64* %arrayidx.i16.i.i.i, align 8, !tbaa !40
br label %omp.dispatch.cond.i.preheader
if.then83.i.i.i: ; preds = %_Z21GetNumberOfOmpThreadsb.exit.i.i.i
store i32 35, i32* %arrayidx.i25.i.i.i, align 4, !tbaa !39
%arrayidx.i14.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %25, i64 0, i32 5, i64 %idxprom.i1.i
store i64 2, i64* %arrayidx.i14.i.i.i, align 8, !tbaa !40
%arrayidx.i12.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %25, i64 0, i32 6, i64 %idxprom.i1.i
store i64 999, i64* %arrayidx.i12.i.i.i, align 8, !tbaa !40
%arrayidx.i.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %25, i64 0, i32 7, i64 %idxprom.i1.i
store i64 0, i64* %arrayidx.i.i.i.i, align 8, !tbaa !40
%28 = load i8, i8* %arrayidx.i.i4, align 1, !tbaa !19
%cmp.i.i5.i.i.i = icmp eq i8 %28, -127
br i1 %cmp.i.i5.i.i.i, label %if.then5.i9.i.i.i, label %__kmpc_barrier.exit10.i.i.i
if.then5.i9.i.i.i: ; preds = %if.then83.i.i.i
tail call void asm sideeffect "bar.sync $0;", "r,~{memory}"(i32 0) #7, !srcloc !41
br label %__kmpc_barrier.exit10.i.i.i
__kmpc_barrier.exit10.i.i.i: ; preds = %if.then5.i9.i.i.i, %if.then83.i.i.i
br i1 %cmp.i, label %if.then95.i.i.i, label %if.end97.i.i.i
if.then95.i.i.i: ; preds = %__kmpc_barrier.exit10.i.i.i
%29 = load %class.omptarget_nvptx_ThreadPrivateContext*, %class.omptarget_nvptx_ThreadPrivateContext** addrspacecast (%class.omptarget_nvptx_ThreadPrivateContext* addrspace(3)* @omptarget_nvptx_threadPrivateContext to %class.omptarget_nvptx_ThreadPrivateContext**), align 8, !tbaa !21
%cnt.i.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %29, i64 0, i32 9
store i64 0, i64* %cnt.i.i.i.i, align 8, !tbaa !40
tail call void @llvm.nvvm.membar.cta() #5
br label %if.end97.i.i.i
if.end97.i.i.i: ; preds = %if.then95.i.i.i, %__kmpc_barrier.exit10.i.i.i
%30 = load i8, i8* %arrayidx.i.i4, align 1, !tbaa !19
%cmp.i.i.i.i.i = icmp eq i8 %30, -127
%cmp.i.i.i.i = icmp ugt i32 %24, 1
%or.cond45.i.i.i = and i1 %cmp.i.i.i.i, %cmp.i.i.i.i.i
br i1 %or.cond45.i.i.i, label %if.then5.i.i.i.i, label %omp.dispatch.cond.i.preheader
if.then5.i.i.i.i: ; preds = %if.end97.i.i.i
tail call void asm sideeffect "bar.sync $0;", "r,~{memory}"(i32 0) #7, !srcloc !41
br label %omp.dispatch.cond.i.preheader
omp.dispatch.cond.i.preheader: ; preds = %if.then5.i.i.i.i, %if.end97.i.i.i, %if.then32.i.i.i
br label %omp.dispatch.cond.i.outer
omp.dispatch.cond.i.outer: ; preds = %omp.dispatch.cond.i.preheader, %omp.inner.for.body.preheader.i
%Count1.0.i.ph = phi i32 [ 0, %omp.dispatch.cond.i.preheader ], [ %67, %omp.inner.for.body.preheader.i ]
br label %omp.dispatch.cond.i
omp.dispatch.cond.i: ; preds = %omp.dispatch.cond.i.outer, %omp.dispatch.body.i
%31 = l
%32 = load i32, i32* %arrayidx.i.i.i1.i, align 4, !tbaa !39
%33 = add i32 %32, -33
%34 = icmp ult i32 %33, 2
br i1 %34, label %if.then7.i.i.i, label %if.end27.i.i.i
if.then7.i.i.i: ; preds = %omp.dispatch.cond.i
%arrayidx.i19.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %31, i64 0, i32 7, i64 %idxprom.i1.i
%35 = load i64, i64* %arrayidx.i19.i.i.i, align 8, !tbaa !40
%conv.i.i.i = trunc i64 %35 to i32
%arrayidx.i17.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %31, i64 0, i32 6, i64 %idxprom.i1.i
%36 = load i64, i64* %arrayidx.i17.i.i.i, align 8, !tbaa !40
%conv10.i.i.i = trunc i64 %36 to i32
%cmp11.i.i.i = icmp sgt i32 %conv.i.i.i, %conv10.i.i.i
br i1 %cmp11.i.i.i, label %omp.dispatch.end.i, label %if.end13.i.i.i
if.end13.i.i.i: ; preds = %if.then7.i.i.i
%arrayidx.i15.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %31, i64 0, i32 5, i64 %idxprom.i1.i
%37 = load i64, i64* %arrayidx.i15.i.i.i, align 8, !tbaa !40
%conv15.i.i.i = trunc i64 %37 to i32
%add.i.i.i18 = add i32 %conv.i.i.i, -1
%sub.i.i.i = add i32 %add.i.i.i18, %conv15.i.i.i
%cmp16.i.i.i = icmp sgt i32 %sub.i.i.i, %conv10.i.i.i
%spec.select.i.i.i = select i1 %cmp16.i.i.i, i32 %conv10.i.i.i, i32 %sub.i.i.i
%arrayidx.i13.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %31, i64 0, i32 8, i64 %idxprom.i1.i
%38 = load i64, i64* %arrayidx.i13.i.i.i, align 8, !tbaa !40
%add23.i.i.i = add i64 %38, %35
%sext.i.i.i = shl i64 %add23.i.i.i, 32
%conv24.i.i.i = ashr exact i64 %sext.i.i.i, 32
store i64 %conv24.i.i.i, i64* %arrayidx.i19.i.i.i, align 8, !tbaa !40
br label %omp.dispatch.body.i
if.end27.i.i.i: ; preds = %omp.dispatch.cond.i
%arrayidx.i9.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %31, i64 0, i32 5, i64 %idxprom.i1.i
%39 = load i64, i64* %arrayidx.i9.i.i.i, align 8, !tbaa !40
%conv31.i.i.i = trunc i64 %39 to i32
%arrayidx.i7.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %31, i64 0, i32 7, i64 %idxprom.i1.i
%40 = load i64, i64* %arrayidx.i7.i.i.i, align 8, !tbaa !40
%conv33.i.i.i = trunc i64 %40 to i32
%arrayidx.i5.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext, %class.omptarget_nvptx_ThreadPrivateContext* %31, i64 0, i32 6, i64 %idxprom.i1.i
%41 = load i64, i64* %arrayidx.i5.i.i.i, align 8, !tbaa !40
%conv35.i.i.i = trunc i64 %41 to i32
%42 = tail call i32 @llvm.nvvm.vote.ballot(i1 true) #5
%neg.i.i.i.i.i.i.i = sub i32 0, %42
%and.i.i.i.i.i.i.i = and i32 %42, %neg.i.i.i.i.i.i.i
%43 = tail call i32 @llvm.ctlz.i32(i32 %and.i.i.i.i.i.i.i, i1 false) #5, !range !42
%sub.i.i.i.i.i = sub nsw i32 31, %43
%44 = tail call i32 asm "mov.u32 $0, %lanemask_lt;", "=r"() #6, !srcloc !43
%and.i.i.i.i.i = and i32 %44, %42
%45 = tail call i32 @llvm.ctpop.i32(i32 %and.i.i.i.i.i) #5, !range !42
%cmp.i.i.i.i2.i = icmp eq i32 %and.i.i.i.i.i, 0
br i1 %cmp.i.i.i.i2.i, label %if.then.i.i.i.i.i, label %_ZN27omptarget_nvptx_LoopSupportIiiE8NextIterEv.exit.i.i.i.i
if.then.i.i.i.i.i: ; preds = %if.end27.i.i.
%add1.i.i.i.i = add nsw i32 %add.i.i.i.i, %conv31.i.i.i
%cmp.i.i.i3.i = icmp sgt i32 %add.i.i.i.i, %conv35.i.i.i
%cmp2.i.i.i.i = icmp sgt i32 %add1.i.i.i.i, %conv35.i.i.i
%or.cond.i.i.i.i = or i1 %cmp.i.i.i3.i, %cmp2.i.i.i.i
%sub.i.i.i.i = add nsw i32 %add1.i.i.i.i, -1
%sub.i.mux.i.i.i = select i1 %or.cond.i.i.i.i, i32 %conv35.i.i.i, i32 %sub.i.i.i.i
br i1 %cmp.i.i.i3.i, label %omp.dispatch.end.i, label %omp.dispatch.body.i
omp.dispatch.body.i: ; preds = %_ZN27omptarget_nvptx_LoopSupportIiiE8NextIterEv.exit.i.i.i.i, %if.end13.i.i.i
%.omp.ub.1.i = phi i32 [ %spec.select.i.i.i, %if.end13.i.i.i ], [ %sub.i.mux.i.i.i, %_ZN27omptarget_nvptx_LoopSupportIiiE8NextIterEv.exit.i.i.i.i ]
%.omp.lb.1.i = phi i32 [ %conv.i.i.i, %if.end13.i.i.i ], [ %add.i.i.i.i, %_ZN27omptarget_nvptx_LoopSupportIiiE8NextIterEv.exit.i.i.i.i ]
%cmp25.i = icmp sgt i32 %.omp.lb.1.i, %.omp.ub.1.i
br i1 %cmp25.i, label %omp.dispatch.cond.i, label %omp.inner.for.body.preheader.i
omp.inner.for.body.preheader.i: ; preds = %omp.dispatch.body.i
%54 = icmp sgt i32 %.omp.ub.1.i, %.omp.lb.1.i
%smax.i = select i1 %54, i32 %.omp.ub.1.i, i32 %.omp.lb.1.i
%55 = sub i32 %smax.i, %.omp.lb.1.i
%56 = add i32 %.omp.lb.1.i, 1
%57 = mul i32 %55, %56
%58 = xor i32 %.omp.lb.1.i, -1
%59 = add i32 %smax.i, %58
%60 = zext i32 %59 to i33
%61 = zext i32 %55 to i33
%62 = mul i33 %60, %61
%63 = lshr i33 %62, 1
%64 = trunc i33 %63 to i32
%65 = add i32 %.omp.lb.1.i, %Count1.0.i.ph
%66 = add i32 %65, %57
%67 = add i32 %66, %64
br label %omp.dispatch.cond.i.outer
omp.dispatch.end.i: ; preds = %_ZN27omptarget_nvptx_LoopSupportIiiE8NextIterEv.exit.i.i.i.i, %if.then7.i.i.i
%68 = load i8, i8* %arrayidx.i.i4, align 1, !tbaa !19
%cmp.i2.i.i.i = icmp eq i8 %68, -127
br i1 %cmp.i2.i.i.i, label %if.else.i3.i.i.i, label %.omp.reduction.then.i
if.else.i3.i.i.i: ; preds = %omp.dispatch.end.i
%69 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5, !range !18
%cmp.i.i8.i = icmp eq i32 %69, 1
br i1 %cmp.i.i8.i, label %.omp.reduction.then.i, label %if.end.i.i.i
if.end.i.i.i: ; preds = %if.else.i3.i.i.i
%sub.i.i9.i = add nuw nsw i32 %69, 31
%div.i.i.i19 = lshr i32 %sub.i.i9.i, 5
%rem.i.i.i20 = and i32 %69, 31
%cmp5.i.i.i = icmp eq i32 %rem.i.i.i20, 0
%sub6.i.i.i = add nsw i32 %div.i.i.i19, -1
%cmp7.i.i.i = icmp ult i32 %div.i.i.i3, %sub6.i.i.i
%or.cond.i.i.i = or i1 %cmp5.i.i.i, %cmp7.i.i.i
br i1 %or.cond.i.i.i, label %if.then8.i.i.i, label %if.else.i.i.i
if.then8.i.i.i:
%add.i.i.4.i.i.i.i = add nsw i32 %74, %add.i.i.3.i.i.i.i
br label %if.end15.i.i.i
if.else.i.i.i: ; preds = %if.end.i.i.i
%cmp9.i.i.i = icmp ugt i32 %69, 1
br i1 %cmp9.i.i.i, label %if.then10.i.i.i, label %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i
if.then10.i.i.i: ; preds = %if.else.i.i.i
%div.i5.i.i.i = lshr i32 %rem.i.i.i20, 1
%cmp9.i6.i.i.i = icmp eq i32 %div.i5.i.i.i, 0
br i1 %cmp9.i6.i.i.i, label %if.end15.i.i.i, label %while.body.lr.ph.i8.i.i.i
while.body.lr.ph.i8.i.i.i: ; preds = %if.then10.i.i.i
%conv.i7.i.i.i = and i16 %conv.i.i, 31
br label %while.body.i14.i.i.i
while.body.i14.i.i.i: ; preds = %while.body.i14.i.i.i, %while.body.lr.ph.i8.i.i.i
%Count1.2.i = phi i32 [ %Count1.0.i.ph, %while.body.lr.ph.i8.i.i.i ], [ %storemerge.i.i.i, %while.body.i14.i.i.i ]
%mask.011.i9.i.i.i = phi i32 [ %div.i5.i.i.i, %while.body.lr.ph.i8.i.i.i ], [ %div3.i20.i.i.i, %while.body.i14.i.i.i ]
%curr_size.010.i10.i.i.i = phi i32 [ %rem.i.i.i20, %while.body.lr.ph.i8.i.i.i ], [ %div2.i19.i.i.i, %while.body.i14.i.i.i ]
%conv1.i11.i.i.i = trunc i32 %mask.011.i9.i.i.i to i16
%sext.i12.i.i.i = shl i32 %mask.011.i9.i.i.i, 16
%conv.i.i.i13.i.i.i = ashr exact i32 %sext.i12.i.i.i, 16
%75 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %Count1.2.i, i32 %conv.i.i.i13.i.i.i, i32 31) #5
%76 = icmp ult i16 %conv.i7.i.i.i, %conv1.i11.i.i.i
%add.i.i.i15.i.i.i = select i1 %76, i32 %Count1.2.i, i32 0
%storemerge.i.i.i = add nsw i32 %add.i.i.i15.i.i.i, %75
%add.i18.i.i.i = add nuw i32 %curr_size.010.i10.i.i.i, 1
%div2.i19.i.i.i = lshr i32 %add.i18.i.i.i, 1
%div3.i20.i.i.i = lshr i32 %add.i18.i.i.i, 2
%cmp.i21.i.i.i = icmp eq i32 %div3.i20.i.i.i, 0
br i1 %cmp.i21.i.i.i, label %if.end15.i.i.i, label %while.body.i14.i.i.i
if.end15.i.i.i: ; preds = %while.body.i14.i.i.i, %if.then10.i.i.i, %if.then8.i.i.i
%Count1.3.i = phi i32 [ %add.i.i.4.i.i.i.i, %if.then8.i.i.i ], [ %Count1.0.i.ph, %if.then10.i.i.i ], [ %storemerge.i.i.i, %while.body.i14.i.i.i ]
%cmp16.i.i10.i = icmp ugt i32 %69, 32
br i1 %cmp16.i.i10.i, label %__kmpc_barrier.exit16.i.i.i.i, label %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i
__kmpc_barrier.exit16.i.i.i.i: ; preds = %if.end15.i.i.i
%nvptx_lane_id.i.i.i.i = and i32 %0, 31
tail call void asm sideeffect "bar.sync $0;", "r,~{memory}"(i32 0) #7, !srcloc !41
%warp_master.i.i.i.i = icmp eq i32 %nvptx_lane_id.i.i.i.i, 0
br i1 %warp_master.i.i.i.i, label %then.i.i.i.i, label %ifcont.i.i.i.i
then.i.i.i.i: ; preds = %__kmpc_barrier.exit16.i.i.i.i
%77 = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i64 %idxprom.i.i
store volatile i32 %Count1.3.i, i32 addrspace(3)* %77, align 4, !tbaa !46
br label %ifcont.i.i.i.i
ifcont.i.i.i.i: ; preds = %then.i.i.i.i, %__kmpc_barrier.exit16.i.i.i.i
%78 = load i8, i8* %arrayidx.i.i4, align 1, !tbaa !19
%cmp.i.i.i.i.i.i = icmp eq i8 %78, -127
br i1 %cmp.i.i.i.i.i.i, label %if.then5.i.i.i.i.i,
__kmpc_barrier.exit.i.i.i.i: ; preds = %if.then5.i.i.i.i.i, %ifcont.i.i.i.i
%is_active_thread.i.i.i.i = icmp ult i32 %0, %div.i.i.i19
br i1 %is_active_thread.i.i.i.i, label %then4.i.i.i.i, label %_omp_reduction_inter_warp_copy_func.exit.i.i.i
then4.i.i.i.i: ; preds = %__kmpc_barrier.exit.i.i.i.i
%79 = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i64 %idxprom.i1.i
%80 = load volatile i32, i32 addrspace(3)* %79, align 4, !tbaa !46
br label %_omp_reduction_inter_warp_copy_func.exit.i.i.i
_omp_reduction_inter_warp_copy_func.exit.i.i.i: ; preds = %then4.i.i.i.i, %__kmpc_barrier.exit.i.i.i.i
%Count1.4.i = phi i32 [ %80, %then4.i.i.i.i ], [ %Count1.3.i, %__kmpc_barrier.exit.i.i.i.i ]
%cmp18.i.i.i = icmp eq i32 %div.i.i.i3, 0
br i1 %cmp18.i.i.i, label %if.then19.i.i.i, label %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i
if.then19.i.i.i: ; preds = %_omp_reduction_inter_warp_copy_func.exit.i.i.i
%div.i.i.i.i = lshr i32 %sub.i.i9.i, 6
%cmp9.i.i.i.i = icmp eq i32 %div.i.i.i.i, 0
br i1 %cmp9.i.i.i.i, label %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i, label %while.body.i.i.i.i
while.body.i.i.i.i: ; preds = %if.then19.i.i.i, %while.body.i.i.i.i
%Count1.5.i = phi i32 [ %storemerge26.i.i.i, %while.body.i.i.i.i ], [ %Count1.4.i, %if.then19.i.i.i ]
%mask.011.i.i.i.i = phi i32 [ %div3.i.i.i.i, %while.body.i.i.i.i ], [ %div.i.i.i.i, %if.then19.i.i.i ]
%curr_size.010.i.i.i.i = phi i32 [ %div2.i.i.i.i, %while.body.i.i.i.i ], [ %div.i.i.i19, %if.then19.i.i.i ]
%conv1.i.i.i.i = trunc i32 %mask.011.i.i.i.i to i16
%sext.i.i.i.i = shl i32 %mask.011.i.i.i.i, 16
%conv.i.i.i.i.i.i = ashr exact i32 %sext.i.i.i.i, 16
%81 = tail call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %Count1.5.i, i32 %conv.i.i.i.i.i.i, i32 31) #5
%82 = icmp ult i16 %conv.i.i, %conv1.i.i.i.i
%add.i.i.i.i.i.i = select i1 %82, i32 %Count1.5.i, i32 0
%storemerge26.i.i.i = add nsw i32 %add.i.i.i.i.i.i, %81
%add.i.i.i12.i = add nuw i32 %curr_size.010.i.i.i.i, 1
%div2.i.i.i.i = lshr i32 %add.i.i.i12.i, 1
%div3.i.i.i.i = lshr i32 %add.i.i.i12.i, 2
%cmp.i1.i.i.i = icmp eq i32 %div3.i.i.i.i, 0
br i1 %cmp.i1.i.i.i, label %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i, label %while.body.i.i.i.i
__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i: ; preds = %while.body.i.i.i.i, %if.then19.i.i.i, %_omp_reduction_inter_warp_copy_func.exit.i.i.i, %if.end15.i.i.i, %if.else.i.i.i
%Count1.6.i = phi i32 [ %Count1.4.i, %if.then19.i.i.i ], [ %Count1.4.i, %_omp_reduction_inter_warp_copy_func.exit.i.i.i ], [ %Count1.3.i, %if.end15.i.i.i ], [ %Count1.0.i.ph, %if.else.i.i.i ], [ %storemerge26.i.i.i, %while.body.i.i.i.i ]
br i1 %cmp.i, label %.omp.reduction.then.i, label %__omp_outlined__.exit
.omp.reduction.then.i: ; preds = %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i, %if.else.i3.i.i.i, %omp.dispatch.end.i
%Count1.724.i = phi i32 [ %Count1.6.i, %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i ], [ %Count1.0.i.ph, %if.else.i3.i.i.i ], [ %Count1.0.i.ph, %omp.dispatch.end.i ]
%83 = load i32, i32* %Count, align 4, !tbaa !46
%add4.i = add nsw i32 %83, %Count1.724.i
store i32 %add4.i, i32* %Count, align 4, !tbaa !46
br label %__omp_outlined__.exit
__omp_outlined__.exit: ; preds = %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i, %.omp.reduction.then.i
%cmp.i.pr = phi i1 [ false, %__kmpc_nvptx_parallel_reduce_nowait_v2.exit.i ], [ %cmp.i, %.omp.reduction.then.i ]
tail call void asm sideeffect "bar.sync $0;", "r,~{memory}"(i32 0) #7, !srcloc !50
br i1 %cmp.i.pr, label %if.then1.i, label %__kmpc_spmd_
%add.i.i = or i32 %87, 1
%idxprom.i3.i.i10 = zext i32 %rem.i.i9 to i64
%arrayidx.i4.i.i11 = getelementptr inbounds %class.omptarget_nvptx_Queue, %class.omptarget_nvptx_Queue* %arrayidx.i8, i64 0, i32 3, i64 %idxprom.i3.i.i10
br label %while.cond.i.i13
while.cond.i.i13: ; preds = %while.cond.i.i13, %if.then1.i
%88 = atomicrmw or i32* %arrayidx.i4.i.i11, i32 0 seq_cst
%cmp.i.i.i12 = icmp eq i32 %88, %add.i.i
br i1 %cmp.i.i.i12, label %_ZN21omptarget_nvptx_QueueI36omptarget_nvptx_ThreadPrivateContextLj32EE7EnqueueEPS0_.exit.i, label %while.cond.i.i13
_ZN21omptarget_nvptx_QueueI36omptarget_nvptx_ThreadPrivateContextLj32EE7EnqueueEPS0_.exit.i: ; preds = %while.cond.i.i13
%arrayidx.i2.i.i14 = getelementptr inbounds %class.omptarget_nvptx_Queue, %class.omptarget_nvptx_Queue* %arrayidx.i8, i64 0, i32 1, i64 %idxprom.i3.i.i10
%89 = bitcast %class.omptarget_nvptx_ThreadPrivateContext** %arrayidx.i2.i.i14 to i64*
%90 = atomicrmw xchg i64* %89, i64 %85 seq_cst
%add.i.i.i15 = add nuw nsw i32 %add.i.i, 1
%rem.i.i.i16 = and i32 %add.i.i.i15, 33554430
%91 = atomicrmw xchg i32* %arrayidx.i4.i.i11, i32 %rem.i.i.i16 seq_cst
br label %__kmpc_spmd_kernel_deinit_v2.exit
__kmpc_spmd_kernel_deinit_v2.exit: ; preds = %__omp_outlined__.exit, %_ZN21omptarget_nvptx_QueueI36omptarget_nvptx_ThreadPrivateContextLj32EE7EnqueueEPS0_.exit.i
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
; Function Attrs: nounwind
declare void @llvm.nvvm.membar.cta() #2
; Function Attrs: nounwind readnone speculatable
declare i32 @llvm.ctlz.i32(i32, i1 immarg) #3
; Function Attrs: nounwind readnone speculatable
declare i32 @llvm.ctpop.i32(i32) #3
; Function Attrs: convergent inaccessiblememonly nounwind
declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #4
; Function Attrs: convergent inaccessiblememonly nounwind
declare i32 @llvm.nvvm.vote.ballot(i1) #4
; Function Attrs: convergent inaccessiblememonly nounwind
declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32) #4
attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"=
!8 = !{null, !"align", i32 16}
!9 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!10 = !{!"clang version 9.0.0 (/dev/shm/jdoerfert/llvm-project/llvm/tools/clang 3bc6e2a7aa3853b06045c42e81af094647c48676)"}
!11 = !{!"clang version 9.0.0 "}
!12 = !{i32 1, i32 2}
!13 = !{!14, !14, i64 0}
!14 = !{!"int", !15, i64 0}
!15 = !{!"omnipotent char", !16, i64 0}
!16 = !{!"Simple C++ TBAA"}
!17 = !{i32 0, i32 1024}
!18 = !{i32 1, i32 1025}
!19 = !{!15, !15, i64 0}
!20 = !{i32 1115}
!21 = !{!22, !22, i64 0}
!22 = !{!"any pointer", !15, i64 0}
!23 = !{!24, !15, i64 40}
!24 = !{!"_ZTS25omptarget_nvptx_TaskDescr", !25, i64 0, !28, i64 40, !22, i64 56}
!25 = !{!"_ZTSN25omptarget_nvptx_TaskDescr20SavedLoopDescr_itemsE", !26, i64 0, !26, i64 8, !26, i64 16, !26, i64 24, !27, i64 32}
!26 = !{!"long", !15, i64 0}
!27 = !{!"_ZTS11kmp_sched_t", !15, i64 0}
!28 = !{!"_ZTSN25omptarget_nvptx_TaskDescr15TaskDescr_itemsE", !15, i64 0, !15, i64 1, !29, i64 2, !26, i64 8}
!29 = !{!"short", !15, i64 0}
!30 = !{!24, !29, i64 42}
!31 = !{!24, !26, i64 48}
!32 = !{i32 -2144259087}
!33 = !{!24, !22, i64 56}
!34 = !{!35, !22, i64 24}
!35 = !{!"_ZTS38__kmpc_data_sharing_worker_slot_static", !22, i64 0, !22, i64 8, !22, i64 16, !22, i64 24, !15, i64 32}
!36 = !{!35, !22, i64 0}
!37 = !{!35, !22, i64 8}
!38 = !{!35, !22, i64 16}
!39 = !{!27, !27, i64 0}
!40 = !{!26, !26, i64 0}
!41 = !{i32 -2144261549}
!42 = !{i32 0, i32 33}
!43 = !{i32 16381}
!44 = !{i32 15962}
!45 = !{i32 16120}
!46 = !{!47, !47, i64 0}
!47 = !{!"int", !48, i64 0}
!48 = !{!"omnipotent char", !49, i64 0}
!49 = !{!"Simple C/C++ TBAA"}
!50 = !{i32 -2144258395}
; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda
; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu
; ModuleID = '/tmp/jdoerfert/test-a3d979.bc'
source_filename = "test.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; ---------------------------------------------------------------------------
; Host-module type and global definitions.
; ---------------------------------------------------------------------------
; Source-location descriptor handed to the __kmpc_* runtime entry points; the
; trailing i8* points at a ";file;func;line;col;;" style string (see @.str).
%struct.ident_t = type { i32, i32, i32, i32, i8* }
; Offload-entry and image/descriptor records consumed by __tgt_register_lib /
; __tgt_unregister_lib (field meanings are defined by the offload runtime, not
; by this file).
%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* }
%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* }
; Comdat group shared by the descriptor, the device-image table and the
; register/unregister functions below.
$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any
; Location string referenced by both ident_t instances (@0 and @1).
@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
; @0 and @1 differ only in their second i32 field (2 vs 18).
; NOTE(review): that field is presumably the ident_t flags word - confirm
; against the OpenMP runtime headers.
@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
; Lock storage passed to __kmpc_end_reduce_nowait in @.omp_outlined.
@.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer
@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 18, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
; Region id: its address is stored as the first field of the offload entry
; further down.
@.__omp_offloading_10_1012dd1_main_l7.region_id = weak constant i8 0
; One mapped argument of 4 bytes with map-type bits 547.
; NOTE(review): 4 presumably corresponds to the i32 %Count in @main and 547 to
; a combination of libomptarget map-type flags - confirm against omptarget.h.
@.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 547]
; printf format used on the %if.then path of @main.
@.str.1 = private unnamed_addr constant [16 x i8] c"ERROR [@%i] %i\0A\00", align 1
; Name and entry record for the single offload region; the record is placed in
; section ".omp_offloading.entries".
@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_10_1012dd1_main_l7\00"
@.omp_offloading.entry.__omp_offloading_10_1012dd1_main_l7 = weak local_unnamed_addr constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_10_1012dd1_main_l7.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1
; External symbols delimiting the entries table and the embedded device image;
; they are declared here and defined outside this module.
@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry
@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry
@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8
@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8
; Table with exactly one device image, and the binary descriptor (count = 1)
; that points at it. The descriptor is the argument of both
; __tgt_register_lib and __tgt_unregister_lib.
@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8
@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8
; Passed as the third argument of __cxa_atexit by the descriptor_reg function.
@__dso_handle = external hidden global i8
; Two static constructors, both at priority 0: @.omp_offloading.requires_reg
; (its definition is not part of this listing) and the descriptor registration
; function, whose own address is used as the associated-data pointer.
@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.requires_reg, i8* null }, { i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }]
; Function Attrs: nounwind uwtable
; Host entry point.
;
; Runs a counted loop of 1000 iterations (%i.05 goes 0..999). Each iteration
; zero-initialises the stack slot %Count and records its address in slot 0 of
; %.offload_baseptrs. On the %if.then path it prints "ERROR [@%i] %i" with the
; iteration index and the value %8. Always returns 0.
;
; NOTE: this listing is truncated inside %for.body. The terminator of
; %for.body, the block %omp_offload.cont (named in the "preds" comments) and
; the definition of %8 are not present, so the function does not parse as
; shown.
define dso_local i32 @main() local_unnamed_addr #0 {
entry:
; Stack storage for the counter and the two single-element offload argument
; arrays.
%Count = alloca i32, align 4
%.offload_baseptrs = alloca [1 x i8*], align 8
%.offload_ptrs = alloca [1 x i8*], align 8
; %0 is the i8* view of %Count used by the lifetime markers.
%0 = bitcast i32* %Count to i8*
; %1/%3 are element-0 pointers of the two arrays; %2/%4 are the same
; addresses retyped so that an i32* can be stored directly. %1, %3 and %4 have
; no uses in the visible part of the body.
%1 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0
%2 = bitcast [1 x i8*]* %.offload_baseptrs to i32**
%3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
%4 = bitcast [1 x i8*]* %.offload_ptrs to i32**
br label %for.body
for.cond.cleanup: ; preds = %if.end
ret i32 0
for.body: ; preds = %if.end, %entry
; Loop induction variable: 0 on entry, %inc on the back edge.
%i.05 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
; Count = 0, then baseptrs[0] = &Count.
store i32 0, i32* %Count, align 4, !tbaa !3
store i32* %Count, i32** %2, align 8
; --- listing truncated here: rest of %for.body and %omp_offload.cont missing ---
if.then: ; preds = %omp_offload.cont
; Report the iteration index and %8 (defined in the missing part).
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i64 0, i64 0), i32 %i.05, i32 %8)
br label %if.end
if.end: ; preds = %omp_offload.cont, %if.then
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4
; Advance and leave the loop once the incremented index equals 1000.
%inc = add nuw nsw i32 %i.05, 1
%exitcond = icmp eq i32 %inc, 1000
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
; Function Attrs: norecurse nounwind uwtable
; Host-side outlined body of the parallel loop with a reduction into %Count.
;
; Parameters:
;   %.global_tid. - loaded once; the value is passed as the thread id to every
;                   __kmpc_* call in this function.
;   %.bound_tid.  - never read (readnone).
;   %Count        - the shared i32 that receives the reduction result (plain
;                   store on one path, atomicrmw add on another).
;
; NOTE: this listing is truncated inside %omp.dispatch.body (the second load
; is cut off mid-instruction). The inner loop, %omp.dispatch.end and the head
; of the reduction switch, including the label that owns the "store %add4"
; below, are not present, so the function does not parse as shown.
define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture dereferenceable(4) %Count) #2 {
entry:
; Loop bookkeeping slots handed by address to __kmpc_dispatch_next_4, plus the
; private reduction accumulator %Count1. %.omp.reduction.red_list has no use
; in the visible part of the body.
%.omp.lb = alloca i32, align 4
%.omp.ub = alloca i32, align 4
%.omp.stride = alloca i32, align 4
%.omp.is_last = alloca i32, align 4
%Count1 = alloca i32, align 4
%.omp.reduction.red_list = alloca [1 x i8*], align 8
; Initial values: lb = 0, ub = 999, stride = 1, is_last = 0, Count1 = 0.
%0 = bitcast i32* %.omp.lb to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
store i32 0, i32* %.omp.lb, align 4, !tbaa !3
%1 = bitcast i32* %.omp.ub to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4
store i32 999, i32* %.omp.ub, align 4, !tbaa !3
%2 = bitcast i32* %.omp.stride to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4
store i32 1, i32* %.omp.stride, align 4, !tbaa !3
%3 = bitcast i32* %.omp.is_last to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4
store i32 0, i32* %.omp.is_last, align 4, !tbaa !3
%4 = bitcast i32* %Count1 to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %4) #4
store i32 0, i32* %Count1, align 4, !tbaa !3
%5 = load i32, i32* %.global_tid., align 4, !tbaa !3
; Set up dispatching with the constants (35, 0, 999, 1, 2).
; NOTE(review): per the libomp signature these are presumably schedule kind,
; lower bound, upper bound, stride and chunk, with 35 being a dynamic-chunked
; schedule - confirm against kmp.h.
tail call void @__kmpc_dispatch_init_4(%struct.ident_t* nonnull @0, i32 %5, i32 35, i32 0, i32 999, i32 1, i32 2) #4
; Request the first chunk; a zero result skips straight to %omp.dispatch.end.
%6 = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* nonnull @0, i32 %5, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride) #4
%tobool12 = icmp eq i32 %6, 0
br i1 %tobool12, label %omp.dispatch.end, label %omp.dispatch.body
omp.dispatch.cond.loopexit: ; preds = %omp.inner.for.body.lr.ph, %omp.dispatch.body
; Request the next chunk; a zero result ends the dispatch loop.
%7 = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* nonnull @0, i32 %5, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride) #4
%tobool = icmp eq i32 %7, 0
br i1 %tobool, label %omp.dispatch.end, label %omp.dispatch.body
omp.dispatch.body: ; preds = %entry, %omp.dispatch.cond.loopexit
; Reload the chunk lower bound written by the runtime.
%8 = load i32, i32* %.omp.lb, align 4, !tbaa !3
%9 = load i32,
; --- listing truncated here: the instruction above is cut off and the blocks
; --- up to the reduction epilogue are missing.
; NOTE(review): the next three instructions presumably belong to
; %.omp.reduction.case1 (named in the preds list of %.omp.reduction.default);
; %add4 is defined in the missing part.
store i32 %add4, i32* %Count, align 4, !tbaa !3
call void @__kmpc_end_reduce_nowait(%struct.ident_t* nonnull @1, i32 %5, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) #4
br label %.omp.reduction.default
.omp.reduction.case2: ; preds = %omp.dispatch.end
; Fold the private accumulator into the shared counter with a monotonic
; atomic add.
%28 = load i32, i32* %Count1, align 4, !tbaa !3
%29 = atomicrmw add i32* %Count, i32 %28 monotonic
br label %.omp.reduction.default
.omp.reduction.default: ; preds = %.omp.reduction.case2, %.omp.reduction.case1, %omp.dispatch.end
; End the lifetimes of the five i32 slots in reverse order of their start.
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %4) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4
ret void
}
declare dso_local void @__kmpc_dispatch_init_4(%struct.ident_t*, i32, i32, i32, i32, i32, i32) local_unnamed_addr
declare dso_local i32 @__kmpc_dispatch_next_4(%struct.ident_t*, i32, i32*, i32*, i32*, i32*) local_unnamed_addr
; Function Attrs: norecurse nounwind uwtable
; Pairwise combiner for the i32 "+" reduction.
;
; Each argument is treated as the address of a slot that holds an i32*.
; The function loads both pointed-to integers, adds them (nsw), and writes
; the sum back through the pointer obtained from the first argument. The
; integer reached through the second argument is only read.
define internal void @.omp.reduction.reduction_func(i8* nocapture readonly, i8* nocapture readonly) #2 {
combine:
  ; Destination operand: slot -> i32* -> current value.
  %lhs.slot = bitcast i8* %0 to i32**
  %lhs.ptr = load i32*, i32** %lhs.slot, align 8
  %lhs.val = load i32, i32* %lhs.ptr, align 4, !tbaa !3

  ; Source operand: slot -> i32* -> current value.
  %rhs.slot = bitcast i8* %1 to i32**
  %rhs.ptr = load i32*, i32** %rhs.slot, align 8
  %rhs.val = load i32, i32* %rhs.ptr, align 4, !tbaa !3

  ; *lhs = *rhs + *lhs
  %sum = add nsw i32 %rhs.val, %lhs.val
  store i32 %sum, i32* %lhs.ptr, align 4, !tbaa !3
  ret void
}
; External OpenMP / offload runtime entry points used by the host module.
; The call sites of __kmpc_reduce_nowait, __kmpc_global_thread_num,
; __kmpc_push_num_threads, __kmpc_fork_call and __tgt_target_teams are not
; present in the visible (truncated) function bodies of this listing.
declare dso_local i32 @__kmpc_reduce_nowait(%struct.ident_t*, i32, i32, i64, i8*, void (i8*, i8*)*, [8 x i32]*) local_unnamed_addr
; Called in @.omp_outlined. after the plain store of the reduced value.
declare dso_local void @__kmpc_end_reduce_nowait(%struct.ident_t*, i32, [8 x i32]*) local_unnamed_addr
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
declare dso_local void @__kmpc_push_num_threads(%struct.ident_t*, i32, i32) local_unnamed_addr
; Carries !callback metadata (!8 -> !9 = !{i64 2, i64 -1, i64 -1, i1 true}),
; i.e. parameter index 2 is annotated as the callback callee.
declare !callback !8 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) local_unnamed_addr
; Function Attrs: nounwind
declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #3
; Function Attrs: nounwind uwtable
; Teardown hook for the offload descriptor.
;
; Passes @.omp_offloading.descriptor to __tgt_unregister_lib and discards the
; i32 result. The single i8* parameter is never read (readnone). This is the
; function that the registration constructor hands to __cxa_atexit.
define internal void @.omp_offloading.descriptor_unreg(i8* nocapture readnone) #0 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) {
unregister:
  %unreg.status = tail call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #4
  ret void
}
declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) local_unnamed_addr
; Function Attrs: nounwind uwtable
; Startup hook for the offload descriptor (listed in @llvm.global_ctors).
;
; 1. Registers @.omp_offloading.descriptor via __tgt_register_lib.
; 2. Schedules @.omp_offloading.descriptor_unreg through __cxa_atexit, with
;    the descriptor as its argument and @__dso_handle as the DSO handle.
; Both i32 results are discarded.
define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #0 section ".text.startup" comdat {
register:
  %reg.status = tail call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #4
  %atexit.status = tail call i32 @__cxa_atexit(void (i8*)* nonnull @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* nonnull @__dso_handle) #4
  ret void
}
declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) local_unnamed_addr
; Function Attrs: nounwind
declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #4
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
!omp_offload.info = !{!0}
!llvm.module.flags = !{!1}
!llvm.ident = !{!2}
!0 = !{i32 0, i32 16, i32 16854481, !"main", i32 7, i32 0}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{!"clang version 9.0.0 (/dev/shm/jdoerfert/llvm-project/llvm/tools/clang 3bc6e2a7aa3853b06045c42e81af094647c48676)"}
!3 = !{!4, !4, i64 0}
!4 = !{!"int", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C/C++ TBAA"}
!7 = distinct !{}
!8 = !{!9}
!9 = !{i64 2, i64 -1, i64 -1, i1 true}
; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment