Skip to content

Instantly share code, notes, and snippets.

@zhuangh
Created November 22, 2023 07:54
Show Gist options
  • Save zhuangh/b1010323573bd55dc3a5a6710027f156 to your computer and use it in GitHub Desktop.
Save zhuangh/b1010323573bd55dc3a5a6710027f156 to your computer and use it in GitHub Desktop.
matmul_gtx1060.ir
IR module {
// Triton IR (tt dialect) dump of the kernel. Tensor types at this level carry
// no layout encoding (compare the #blocked / #shared encodings in the TTGIR
// dump that follows in this file).
//
// Kernel summary: accumulates 16x16 f32 tiles with tt.dot over a reduction
// loop and stores one 16x16 f32 result tile.
//   %arg0        : base pointer of the tile used as tt.dot operand 0 ("A" below)
//   %arg1        : base pointer of the tile used as tt.dot operand 1 ("B" below)
//   %arg2        : base pointer of the stored result
//   %arg3..%arg5 : i32 strides (in elements) multiplied with the row index for
//                  %arg0, %arg1 and %arg2 respectively; column offsets are
//                  added unscaled.
// No program id is read anywhere in this function, so every address derives
// only from the base pointers, the strides and the [0, 16) ranges.
// The loop bound (1024) and step (16) are constants in this dump.
tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
// Constants: loop step (16), loop upper bound (1024), loop start (0),
// per-iteration advance of the A pointers (splat 16) and the zero accumulator.
%c16_i32 = arith.constant 16 : i32
%c1024_i32 = arith.constant 1024 : i32
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<16> : tensor<16x16xi32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32>
// %0 = [0, 16); %1 is its 16x1 (row index) form.
%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
%1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
// A row pointers: %arg0 + row * %arg3.
%2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32>
%3 = arith.muli %1, %2 : tensor<16x1xi32>
%4 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>>
%5 = tt.addptr %4, %3 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32>
// %6 is the 1x16 (column index) form of the range; %8 is it broadcast to 16x16
// and is reused for the A, B and output tiles.
%6 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<16xi32>) -> tensor<1x16xi32>
%7 = tt.broadcast %5 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>>
%8 = tt.broadcast %6 : (tensor<1x16xi32>) -> tensor<16x16xi32>
// %9: A tile pointers = %arg0 + row * %arg3 + col.
%9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
// %15: B tile pointers = %arg1 + row * %arg4 + col.
%10 = tt.splat %arg4 : (i32) -> tensor<16x1xi32>
%11 = arith.muli %1, %10 : tensor<16x1xi32>
%12 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>>
%13 = tt.addptr %12, %11 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32>
%14 = tt.broadcast %13 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>>
%15 = tt.addptr %14, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
// Per-iteration advance of the B pointers: 16 * %arg4 elements (16 rows).
%16 = arith.muli %arg4, %c16_i32 : i32
%17 = tt.splat %16 : (i32) -> tensor<16x16xi32>
// Reduction loop: %arg6 runs 0, 16, ..., 1008 (64 iterations).
// Loop-carried values: %arg7 = accumulator, %arg8 = A pointers, %arg9 = B pointers.
%18:3 = scf.for %arg6 = %c0_i32 to %c1024_i32 step %c16_i32 iter_args(%arg7 = %cst_0, %arg8 = %9, %arg9 = %15) -> (tensor<16x16xf32>, tensor<16x16x!tt.ptr<f32>>, tensor<16x16x!tt.ptr<f32>>) : i32 {
// Load the current A and B tiles; neither load carries a mask operand.
%25 = tt.load %arg8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16xf32>
%26 = tt.load %arg9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16xf32>
// Accumulate: %27 = dot(A tile, B tile) with %arg7 as the accumulator input.
%27 = tt.dot %25, %26, %arg7 {allowTF32 = true} : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32>
// Advance A by 16 elements and B by 16 * %arg4 elements for the next iteration.
%28 = tt.addptr %arg8, %cst : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
%29 = tt.addptr %arg9, %17 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
scf.yield %27, %28, %29 : tensor<16x16xf32>, tensor<16x16x!tt.ptr<f32>>, tensor<16x16x!tt.ptr<f32>>
}
// %24: output tile pointers = %arg2 + row * %arg5 + col.
%19 = tt.splat %arg5 : (i32) -> tensor<16x1xi32>
%20 = arith.muli %1, %19 : tensor<16x1xi32>
%21 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>>
%22 = tt.addptr %21, %20 : tensor<16x1x!tt.ptr<f32>>, tensor<16x1xi32>
%23 = tt.broadcast %22 : (tensor<16x1x!tt.ptr<f32>>) -> tensor<16x16x!tt.ptr<f32>>
%24 = tt.addptr %23, %8 : tensor<16x16x!tt.ptr<f32>>, tensor<16x16xi32>
// Store the final accumulator (%18#0); the store carries no mask operand.
tt.store %24, %18#0 {cache = 1 : i32, evict = 1 : i32} : tensor<16x16xf32>
tt.return
}
}
TTGIR #blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
// Layout encodings as they are used in the function below:
//   #blocked  : pointer, offset and mask tensors, and the value passed to tt.store
//   #blocked1 : tt.dot accumulator/result; also the parent of both dot_op operand layouts
//   #shared   : the 3x16x16 staging buffers and every slice extracted from them
// The module attributes record 4 warps of 32 threads.
module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
// Same kernel signature as the TTIR dump earlier in this file, now with layout
// encodings on every tensor. The two tt.load ops of the TTIR loop appear here
// as triton_gpu.insert_slice_async copies into 3-slot buffers that are issued
// ahead of the tt.dot consuming them, and each 16x16x16 dot is split into two
// 16x8 * 8x16 dots.
// "A" = data read through %arg0 (dot operand 0), "B" = data read through %arg1
// (dot operand 1).
tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
// Constants: slot indices / counter seeds (1, 2), all-true copy mask, loop
// step (16), slot count (3), zero accumulator, A pointer advance (splat 16),
// loop bound (1024) and loop start (0).
%c1_i32 = arith.constant 1 : i32
%c2_i32 = arith.constant 2 : i32
%cst = arith.constant dense<true> : tensor<16x16xi1, #blocked>
%c16_i32 = arith.constant 16 : i32
%c3_i32 = arith.constant 3 : i32
%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #blocked1>
%cst_1 = arith.constant dense<16> : tensor<16x16xi32, #blocked>
%c1024_i32 = arith.constant 1024 : i32
%c0_i32 = arith.constant 0 : i32
// Row index range (slice of #blocked along dim 1) and A row pointers:
// %arg0 + row * %arg3.
%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
%1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
%2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32, #blocked>
%3 = arith.muli %1, %2 : tensor<16x1xi32, #blocked>
%4 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked>
%5 = tt.addptr %4, %3 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked>
// Column index range gets its own make_range because it uses the other slice
// layout (dim 0 of #blocked). %9 is the 16x16 column offset reused for A, B
// and the output.
%6 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x16xi32, #blocked>
%8 = tt.broadcast %5 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked>
%9 = tt.broadcast %7 : (tensor<1x16xi32, #blocked>) -> tensor<16x16xi32, #blocked>
// %10: A tile pointers for the first loop step.
%10 = tt.addptr %8, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
// %16: B tile pointers for the first loop step = %arg1 + row * %arg4 + col.
%11 = tt.splat %arg4 : (i32) -> tensor<16x1xi32, #blocked>
%12 = arith.muli %1, %11 : tensor<16x1xi32, #blocked>
%13 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked>
%14 = tt.addptr %13, %12 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked>
%15 = tt.broadcast %14 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked>
%16 = tt.addptr %15, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
// %18: per-step advance of the B pointers, 16 * %arg4 elements.
%17 = arith.muli %arg4, %c16_i32 : i32
%18 = tt.splat %17 : (i32) -> tensor<16x16xi32, #blocked>
// Prologue, first tile: allocate a 3-slot buffer for A and for B and issue
// the copies of the first tiles into slot 0 with the all-true mask.
%19 = triton_gpu.alloc_tensor : tensor<3x16x16xf32, #shared>
%20 = triton_gpu.insert_slice_async %10, %19, %c0_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
triton_gpu.async_commit_group
%21 = triton_gpu.alloc_tensor : tensor<3x16x16xf32, #shared>
%22 = triton_gpu.insert_slice_async %16, %21, %c0_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
triton_gpu.async_commit_group
// Prologue, second tile: advance both pointer tensors by one loop step and
// issue the copies into slot 1.
%23 = tt.addptr %10, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
%24 = tt.addptr %16, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
%25 = triton_gpu.insert_slice_async %23, %20, %c1_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
triton_gpu.async_commit_group
%26 = triton_gpu.insert_slice_async %24, %22, %c1_i32, %cst {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
triton_gpu.async_commit_group
// NOTE(review): four commit groups have been issued at this point; num = 2
// presumably lets the two newest stay in flight so that the slot-0 copies are
// complete before they are read -- confirm against the async_wait definition.
triton_gpu.async_wait {num = 2 : i32}
// Take slot 0 of each buffer and pre-convert the first half of the reduction
// dimension (A columns 0..7, B rows 0..7) to the dot-operand layouts.
%27 = triton_gpu.extract_slice %25[0, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared>
%28 = triton_gpu.extract_slice %26[0, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared>
%29 = triton_gpu.extract_slice %27[0, 0] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared>
%30 = triton_gpu.convert_layout %29 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
%31 = triton_gpu.extract_slice %28[0, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared>
%32 = triton_gpu.convert_layout %31 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>
// Pipelined reduction loop: %arg6 runs 0, 16, ..., 1008 (64 iterations).
// Loop-carried values:
//   %arg7          accumulator (#blocked1)
//   %arg8, %arg9   A/B pointers advanced every iteration (%47, %48); no copy
//                  or load inside the loop reads through them
//   %arg10, %arg11 A/B 3-slot buffers
//   %arg12, %arg13 A/B slices consumed by the current iteration
//   %arg14, %arg15 A/B pointers used by the most recently issued copy
//   %arg16         reduction offset of the most recently issued copy (starts at 16)
//   %arg17         counter whose value mod 3 is the slot to fill (starts at 2)
//   %arg18         counter whose value mod 3 is the slot to read (starts at 1)
//   %arg19, %arg20 first-half A/B operands already in dot-operand layout
%33:14 = scf.for %arg6 = %c0_i32 to %c1024_i32 step %c16_i32 iter_args(%arg7 = %cst_0, %arg8 = %10, %arg9 = %16, %arg10 = %25, %arg11 = %26, %arg12 = %27, %arg13 = %28, %arg14 = %23, %arg15 = %24, %arg16 = %c16_i32, %arg17 = %c2_i32, %arg18 = %c1_i32, %arg19 = %30, %arg20 = %32) -> (tensor<16x16xf32, #blocked1>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<3x16x16xf32, #shared>, tensor<3x16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, i32, i32, i32, tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>, tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>) : i32 {
// Second half of the reduction dimension from the current slices:
// A columns 8..15 and B rows 8..15, converted to dot-operand layout.
%41 = triton_gpu.extract_slice %arg12[0, 8] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared>
%42 = triton_gpu.convert_layout %41 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
%43 = triton_gpu.extract_slice %arg13[8, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared>
%44 = triton_gpu.convert_layout %43 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>
// Two chained dots: the carried-in first half accumulates onto %arg7, then the
// second half accumulates onto that result.
%45 = tt.dot %arg19, %arg20, %arg7 {allowTF32 = true} : tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<16x16xf32, #blocked1>
%46 = tt.dot %42, %44, %45 {allowTF32 = true} : tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>> * tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>> -> tensor<16x16xf32, #blocked1>
// Advance the carried A/B pointers (these results only feed scf.yield).
%47 = tt.addptr %arg8, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
%48 = tt.addptr %arg9, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
// Reduction offset of the tile to copy next, and whether it is still below
// the loop bound; %50 becomes the copy mask.
%49 = arith.addi %arg16, %c16_i32 : i32
%50 = arith.cmpi slt, %49, %c1024_i32 : i32
// Slot indices: %51 is filled by this iteration's copies, %52 is read for the
// next iteration's operands.
%51 = arith.remsi %arg17, %c3_i32 : i32
%52 = arith.remsi %arg18, %c3_i32 : i32
// Pointers for the next copy and the scalar predicate splat to a 16x16 mask.
%53 = tt.addptr %arg14, %cst_1 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
%54 = tt.addptr %arg15, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
%55 = tt.splat %50 : (i1) -> tensor<16x16xi1, #blocked>
// Issue the masked copies into slot %51 of the A and B buffers, one commit
// group each.
%56 = triton_gpu.insert_slice_async %53, %arg10, %51, %55 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
triton_gpu.async_commit_group
%57 = triton_gpu.insert_slice_async %54, %arg11, %51, %55 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f32>, #blocked> -> tensor<3x16x16xf32, #shared>
triton_gpu.async_commit_group
// NOTE(review): presumably leaves the two groups issued just above in flight
// while guaranteeing the slot-%52 data is ready -- confirm async_wait semantics.
triton_gpu.async_wait {num = 2 : i32}
// Slices that the next iteration consumes (slot %52 of each buffer).
%58 = triton_gpu.extract_slice %56[%52, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared>
%59 = triton_gpu.extract_slice %57[%52, 0, 0] [1, 16, 16] [1, 1, 1] : tensor<3x16x16xf32, #shared> to tensor<16x16xf32, #shared>
// Bump both slot counters; the mod-3 reduction happens at the top of the next
// iteration.
%60 = arith.addi %arg17, %c1_i32 : i32
%61 = arith.addi %arg18, %c1_i32 : i32
// Pre-convert the first half (A columns 0..7, B rows 0..7) of the next
// iteration's slices to dot-operand layout.
%62 = triton_gpu.extract_slice %58[0, 0] [16, 8] [1, 1] : tensor<16x16xf32, #shared> to tensor<16x8xf32, #shared>
%63 = triton_gpu.convert_layout %62 : (tensor<16x8xf32, #shared>) -> tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>
%64 = triton_gpu.extract_slice %59[0, 0] [8, 16] [1, 1] : tensor<16x16xf32, #shared> to tensor<8x16xf32, #shared>
%65 = triton_gpu.convert_layout %64 : (tensor<8x16xf32, #shared>) -> tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>
// Yield order matches the iter_args order documented above the loop.
scf.yield %46, %47, %48, %56, %57, %58, %59, %53, %54, %49, %60, %61, %63, %65 : tensor<16x16xf32, #blocked1>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<3x16x16xf32, #shared>, tensor<3x16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16xf32, #shared>, tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16x!tt.ptr<f32>, #blocked>, i32, i32, i32, tensor<16x8xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked1}>>, tensor<8x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked1}>>
}
// Wait with num = 0 after the loop (no commit group left outstanding --
// NOTE(review): confirm against the async_wait definition).
triton_gpu.async_wait {num = 0 : i32}
// %39: output tile pointers = %arg2 + row * %arg5 + col.
%34 = tt.splat %arg5 : (i32) -> tensor<16x1xi32, #blocked>
%35 = arith.muli %1, %34 : tensor<16x1xi32, #blocked>
%36 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<16x1x!tt.ptr<f32>, #blocked>
%37 = tt.addptr %36, %35 : tensor<16x1x!tt.ptr<f32>, #blocked>, tensor<16x1xi32, #blocked>
%38 = tt.broadcast %37 : (tensor<16x1x!tt.ptr<f32>, #blocked>) -> tensor<16x16x!tt.ptr<f32>, #blocked>
%39 = tt.addptr %38, %9 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
// The accumulator is in #blocked1 while the output pointers are in #blocked,
// so it is converted to #blocked before the store.
%40 = triton_gpu.convert_layout %33#0 : (tensor<16x16xf32, #blocked1>) -> tensor<16x16xf32, #blocked>
tt.store %39, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<16x16xf32, #blocked>
tt.return
}
}
LLIR ; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
@global_smem = external addrspace(3) global [0 x i8]
define void @matmul_kernel_0d1d2d3d4c5d6c7d8c(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, i32 %5) local_unnamed_addr !dbg !5 {
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
%8 = and i32 %7, 31, !dbg !8
%9 = lshr i32 %7, 5, !dbg !8
%10 = and i32 %9, 3, !dbg !8
%11 = lshr i32 %8, 3, !dbg !8
%12 = shl nuw nsw i32 %10, 2, !dbg !8
%13 = or i32 %12, %11, !dbg !8
%14 = shl i32 %7, 1, !dbg !9
%15 = and i32 %14, 14, !dbg !9
%16 = mul i32 %13, %3, !dbg !10
%17 = sext i32 %16 to i64, !dbg !11
%18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !11
%19 = zext i32 %15 to i64, !dbg !12
%20 = getelementptr float, ptr addrspace(1) %18, i64 %19, !dbg !12
%21 = mul i32 %13, %4, !dbg !13
%22 = sext i32 %21 to i64, !dbg !14
%23 = getelementptr float, ptr addrspace(1) %1, i64 %22, !dbg !14
%24 = getelementptr float, ptr addrspace(1) %23, i64 %19, !dbg !15
%25 = shl i32 %4, 4, !dbg !16
%26 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %20, i1 true) #2, !dbg !17
%27 = extractvalue { i32, i32 } %26, 0, !dbg !17
%28 = extractvalue { i32, i32 } %26, 1, !dbg !17
%29 = shl nuw nsw i32 %13, 4, !dbg !17
%30 = or i32 %29, %15, !dbg !17
%31 = zext i32 %30 to i64, !dbg !17
%32 = getelementptr float, ptr addrspace(3) @global_smem, i64 %31, !dbg !17
%33 = getelementptr float, ptr addrspace(3) %32, i64 1, !dbg !17
store i32 %27, ptr addrspace(3) %32, align 4, !dbg !17
store i32 %28, ptr addrspace(3) %33, align 4, !dbg !17
%34 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %24, i1 true) #2, !dbg !18
%35 = extractvalue { i32, i32 } %34, 0, !dbg !18
%36 = extractvalue { i32, i32 } %34, 1, !dbg !18
%37 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), i64 %31, !dbg !18
%38 = getelementptr float, ptr addrspace(3) %37, i64 1, !dbg !18
store i32 %35, ptr addrspace(3) %37, align 4, !dbg !18
store i32 %36, ptr addrspace(3) %38, align 4, !dbg !18
%39 = getelementptr float, ptr addrspace(1) %20, i64 16, !dbg !19
%40 = sext i32 %25 to i64, !dbg !20
%41 = getelementptr float, ptr addrspace(1) %24, i64 %40, !dbg !20
%42 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %39, i1 true) #2, !dbg !17
%43 = extractvalue { i32, i32 } %42, 0, !dbg !17
%44 = extractvalue { i32, i32 } %42, 1, !dbg !17
tail call void @llvm.nvvm.barrier0(), !dbg !17
%45 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 1024), i64 %31, !dbg !17
%46 = getelementptr float, ptr addrspace(3) %45, i64 1, !dbg !17
store i32 %43, ptr addrspace(3) %45, align 4, !dbg !17
store i32 %44, ptr addrspace(3) %46, align 4, !dbg !17
%47 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %41, i1 true) #2, !dbg !18
%48 = extractvalue { i32, i32 } %47, 0, !dbg !18
%49 = extractvalue { i32, i32 } %47, 1, !dbg !18
%50 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4096), i64 %31, !dbg !18
%51 = getelementptr float, ptr addrspace(3) %50, i64 1, !dbg !18
store i32 %48, ptr addrspace(3) %50, align 4, !dbg !18
store i32 %49, ptr addrspace(3) %51, align 4, !dbg !18
tail call void @llvm.nvvm.barrier0(), !dbg !17
%52 = lshr i32 %7, 4, !dbg !17
%53 = and i32 %52, 7, !dbg !17
%54 = shl nuw nsw i32 %53, 4, !dbg !17
%55 = zext i32 %54 to i64, !dbg !17
%56 = and i32 %7, 15, !dbg !18
%57 = zext i32 %56 to i64, !dbg !18
br label %58, !dbg !21
58: ; preds = %6, %58
%.pn5864 = phi ptr addrspace(3) [ getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), %6 ], [ %230, %58 ]
%.pn63 = phi ptr addrspace(3) [ @global_smem, %6 ], [ %224, %58 ]
%59 = phi i32 [ 1, %6 ], [ %237, %58 ]
%60 = phi i32 [ 2, %6 ], [ %236, %58 ]
%61 = phi i32 [ 16, %6 ], [ %208, %58 ]
%.pn762 = phi ptr addrspace(1) [ %41, %6 ], [ %211, %58 ]
%.pn361 = phi ptr addrspace(1) [ %39, %6 ], [ %210, %58 ]
%62 = phi { ptr addrspace(3), i32, i32, i32, i32 } [ { ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 3072), i32 16, i32 1, i32 0, i32 0 }, %6 ], [ %235, %58 ]
%63 = phi { ptr addrspace(3), i32, i32, i32, i32 } [ { ptr addrspace(3) @global_smem, i32 16, i32 1, i32 0, i32 0 }, %6 ], [ %229, %58 ]
%64 = phi { float, float } [ zeroinitializer, %6 ], [ %207, %58 ]
%65 = phi i32 [ 0, %6 ], [ %238, %58 ]
%.pn55.in = getelementptr float, ptr addrspace(3) %.pn5864, i64 %57, !dbg !18
%.pn39.in = getelementptr float, ptr addrspace(3) %.pn63, i64 %55, !dbg !17
%.pn41.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 112, !dbg !18
%.pn43.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 96, !dbg !18
%.pn45.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 80, !dbg !18
%.pn47.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 64, !dbg !18
%.pn49.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 48, !dbg !18
%.pn51.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 32, !dbg !18
%.pn53.in = getelementptr float, ptr addrspace(3) %.pn55.in, i64 16, !dbg !18
%.pn9.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 135, !dbg !17
%.pn11.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 7, !dbg !17
%.pn13.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 134, !dbg !17
%.pn15.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 6, !dbg !17
%.pn17.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 133, !dbg !17
%.pn19.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 5, !dbg !17
%.pn21.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 132, !dbg !17
%.pn23.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 4, !dbg !17
%.pn25.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 131, !dbg !17
%.pn27.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 3, !dbg !17
%.pn29.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 130, !dbg !17
%.pn31.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 2, !dbg !17
%.pn33.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 129, !dbg !17
%.pn35.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 1, !dbg !17
%.pn37.in = getelementptr float, ptr addrspace(3) %.pn39.in, i64 128, !dbg !17
%.pn55 = load float, ptr addrspace(3) %.pn55.in, align 4, !dbg !18
%.pn53 = load float, ptr addrspace(3) %.pn53.in, align 4, !dbg !18
%.pn51 = load float, ptr addrspace(3) %.pn51.in, align 4, !dbg !18
%.pn49 = load float, ptr addrspace(3) %.pn49.in, align 4, !dbg !18
%.pn47 = load float, ptr addrspace(3) %.pn47.in, align 4, !dbg !18
%.pn45 = load float, ptr addrspace(3) %.pn45.in, align 4, !dbg !18
%.pn43 = load float, ptr addrspace(3) %.pn43.in, align 4, !dbg !18
%.pn41 = load float, ptr addrspace(3) %.pn41.in, align 4, !dbg !18
%.pn39 = load float, ptr addrspace(3) %.pn39.in, align 4, !dbg !17
%.pn37 = load float, ptr addrspace(3) %.pn37.in, align 4, !dbg !17
%.pn35 = load float, ptr addrspace(3) %.pn35.in, align 4, !dbg !17
%.pn33 = load float, ptr addrspace(3) %.pn33.in, align 4, !dbg !17
%.pn31 = load float, ptr addrspace(3) %.pn31.in, align 4, !dbg !17
%.pn29 = load float, ptr addrspace(3) %.pn29.in, align 4, !dbg !17
%.pn27 = load float, ptr addrspace(3) %.pn27.in, align 4, !dbg !17
%.pn25 = load float, ptr addrspace(3) %.pn25.in, align 4, !dbg !17
%.pn23 = load float, ptr addrspace(3) %.pn23.in, align 4, !dbg !17
%.pn21 = load float, ptr addrspace(3) %.pn21.in, align 4, !dbg !17
%.pn19 = load float, ptr addrspace(3) %.pn19.in, align 4, !dbg !17
%.pn17 = load float, ptr addrspace(3) %.pn17.in, align 4, !dbg !17
%.pn15 = load float, ptr addrspace(3) %.pn15.in, align 4, !dbg !17
%.pn13 = load float, ptr addrspace(3) %.pn13.in, align 4, !dbg !17
%.pn11 = load float, ptr addrspace(3) %.pn11.in, align 4, !dbg !17
%.pn9 = load float, ptr addrspace(3) %.pn9.in, align 4, !dbg !17
%66 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 0, !dbg !17
%67 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 1, !dbg !17
%68 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %63, 2, !dbg !17
%69 = shl i32 %68, 3, !dbg !17
%70 = sext i32 %69 to i64, !dbg !17
%71 = getelementptr float, ptr addrspace(3) %66, i64 %70, !dbg !17
%72 = mul i32 %67, %53, !dbg !17
%73 = sext i32 %72 to i64, !dbg !17
%74 = getelementptr float, ptr addrspace(3) %71, i64 %73, !dbg !17
%75 = load float, ptr addrspace(3) %74, align 4, !dbg !17
%76 = shl i32 %67, 3, !dbg !17
%77 = sext i32 %76 to i64, !dbg !17
%78 = getelementptr float, ptr addrspace(3) %74, i64 %77, !dbg !17
%79 = load float, ptr addrspace(3) %78, align 4, !dbg !17
%80 = sext i32 %68 to i64, !dbg !17
%81 = getelementptr float, ptr addrspace(3) %74, i64 %80, !dbg !17
%82 = load float, ptr addrspace(3) %81, align 4, !dbg !17
%83 = add i32 %76, %68, !dbg !17
%84 = sext i32 %83 to i64, !dbg !17
%85 = getelementptr float, ptr addrspace(3) %74, i64 %84, !dbg !17
%86 = load float, ptr addrspace(3) %85, align 4, !dbg !17
%87 = shl i32 %68, 1, !dbg !17
%88 = sext i32 %87 to i64, !dbg !17
%89 = getelementptr float, ptr addrspace(3) %74, i64 %88, !dbg !17
%90 = load float, ptr addrspace(3) %89, align 4, !dbg !17
%91 = add i32 %76, %87, !dbg !17
%92 = sext i32 %91 to i64, !dbg !17
%93 = getelementptr float, ptr addrspace(3) %74, i64 %92, !dbg !17
%94 = load float, ptr addrspace(3) %93, align 4, !dbg !17
%95 = mul i32 %68, 3, !dbg !17
%96 = sext i32 %95 to i64, !dbg !17
%97 = getelementptr float, ptr addrspace(3) %74, i64 %96, !dbg !17
%98 = load float, ptr addrspace(3) %97, align 4, !dbg !17
%99 = add i32 %76, %95, !dbg !17
%100 = sext i32 %99 to i64, !dbg !17
%101 = getelementptr float, ptr addrspace(3) %74, i64 %100, !dbg !17
%102 = load float, ptr addrspace(3) %101, align 4, !dbg !17
%103 = shl i32 %68, 2, !dbg !17
%104 = sext i32 %103 to i64, !dbg !17
%105 = getelementptr float, ptr addrspace(3) %74, i64 %104, !dbg !17
%106 = load float, ptr addrspace(3) %105, align 4, !dbg !17
%107 = add i32 %76, %103, !dbg !17
%108 = sext i32 %107 to i64, !dbg !17
%109 = getelementptr float, ptr addrspace(3) %74, i64 %108, !dbg !17
%110 = load float, ptr addrspace(3) %109, align 4, !dbg !17
%111 = mul i32 %68, 5, !dbg !17
%112 = sext i32 %111 to i64, !dbg !17
%113 = getelementptr float, ptr addrspace(3) %74, i64 %112, !dbg !17
%114 = load float, ptr addrspace(3) %113, align 4, !dbg !17
%115 = add i32 %76, %111, !dbg !17
%116 = sext i32 %115 to i64, !dbg !17
%117 = getelementptr float, ptr addrspace(3) %74, i64 %116, !dbg !17
%118 = load float, ptr addrspace(3) %117, align 4, !dbg !17
%119 = mul i32 %68, 6, !dbg !17
%120 = sext i32 %119 to i64, !dbg !17
%121 = getelementptr float, ptr addrspace(3) %74, i64 %120, !dbg !17
%122 = load float, ptr addrspace(3) %121, align 4, !dbg !17
%123 = add i32 %76, %119, !dbg !17
%124 = sext i32 %123 to i64, !dbg !17
%125 = getelementptr float, ptr addrspace(3) %74, i64 %124, !dbg !17
%126 = load float, ptr addrspace(3) %125, align 4, !dbg !17
%127 = mul i32 %68, 7, !dbg !17
%128 = sext i32 %127 to i64, !dbg !17
%129 = getelementptr float, ptr addrspace(3) %74, i64 %128, !dbg !17
%130 = load float, ptr addrspace(3) %129, align 4, !dbg !17
%131 = add i32 %76, %127, !dbg !17
%132 = sext i32 %131 to i64, !dbg !17
%133 = getelementptr float, ptr addrspace(3) %74, i64 %132, !dbg !17
%134 = load float, ptr addrspace(3) %133, align 4, !dbg !17
%135 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 0, !dbg !18
%136 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 1, !dbg !18
%137 = extractvalue { ptr addrspace(3), i32, i32, i32, i32 } %62, 2, !dbg !18
%138 = shl i32 %136, 3, !dbg !18
%139 = sext i32 %138 to i64, !dbg !18
%140 = getelementptr float, ptr addrspace(3) %135, i64 %139, !dbg !18
%141 = mul i32 %137, %56, !dbg !18
%142 = sext i32 %141 to i64, !dbg !18
%143 = getelementptr float, ptr addrspace(3) %140, i64 %142, !dbg !18
%144 = load float, ptr addrspace(3) %143, align 4, !dbg !18
%145 = sext i32 %136 to i64, !dbg !18
%146 = getelementptr float, ptr addrspace(3) %143, i64 %145, !dbg !18
%147 = load float, ptr addrspace(3) %146, align 4, !dbg !18
%148 = shl i32 %136, 1, !dbg !18
%149 = sext i32 %148 to i64, !dbg !18
%150 = getelementptr float, ptr addrspace(3) %143, i64 %149, !dbg !18
%151 = load float, ptr addrspace(3) %150, align 4, !dbg !18
%152 = mul i32 %136, 3, !dbg !18
%153 = sext i32 %152 to i64, !dbg !18
%154 = getelementptr float, ptr addrspace(3) %143, i64 %153, !dbg !18
%155 = load float, ptr addrspace(3) %154, align 4, !dbg !18
%156 = shl i32 %136, 2, !dbg !18
%157 = sext i32 %156 to i64, !dbg !18
%158 = getelementptr float, ptr addrspace(3) %143, i64 %157, !dbg !18
%159 = load float, ptr addrspace(3) %158, align 4, !dbg !18
%160 = mul i32 %136, 5, !dbg !18
%161 = sext i32 %160 to i64, !dbg !18
%162 = getelementptr float, ptr addrspace(3) %143, i64 %161, !dbg !18
%163 = load float, ptr addrspace(3) %162, align 4, !dbg !18
%164 = mul i32 %136, 6, !dbg !18
%165 = sext i32 %164 to i64, !dbg !18
%166 = getelementptr float, ptr addrspace(3) %143, i64 %165, !dbg !18
%167 = load float, ptr addrspace(3) %166, align 4, !dbg !18
%168 = mul i32 %136, 7, !dbg !18
%169 = sext i32 %168 to i64, !dbg !18
%170 = getelementptr float, ptr addrspace(3) %143, i64 %169, !dbg !18
%171 = load float, ptr addrspace(3) %170, align 4, !dbg !18
%172 = extractvalue { float, float } %64, 0, !dbg !22
%173 = extractvalue { float, float } %64, 1, !dbg !22
%174 = tail call float @llvm.fmuladd.f32(float %.pn39, float %.pn55, float %172), !dbg !22
%175 = tail call float @llvm.fmuladd.f32(float %.pn37, float %.pn55, float %173), !dbg !22
%176 = tail call float @llvm.fmuladd.f32(float %.pn35, float %.pn53, float %174), !dbg !22
%177 = tail call float @llvm.fmuladd.f32(float %.pn33, float %.pn53, float %175), !dbg !22
%178 = tail call float @llvm.fmuladd.f32(float %.pn31, float %.pn51, float %176), !dbg !22
%179 = tail call float @llvm.fmuladd.f32(float %.pn29, float %.pn51, float %177), !dbg !22
%180 = tail call float @llvm.fmuladd.f32(float %.pn27, float %.pn49, float %178), !dbg !22
%181 = tail call float @llvm.fmuladd.f32(float %.pn25, float %.pn49, float %179), !dbg !22
%182 = tail call float @llvm.fmuladd.f32(float %.pn23, float %.pn47, float %180), !dbg !22
%183 = tail call float @llvm.fmuladd.f32(float %.pn21, float %.pn47, float %181), !dbg !22
%184 = tail call float @llvm.fmuladd.f32(float %.pn19, float %.pn45, float %182), !dbg !22
%185 = tail call float @llvm.fmuladd.f32(float %.pn17, float %.pn45, float %183), !dbg !22
%186 = tail call float @llvm.fmuladd.f32(float %.pn15, float %.pn43, float %184), !dbg !22
%187 = tail call float @llvm.fmuladd.f32(float %.pn13, float %.pn43, float %185), !dbg !22
%188 = tail call float @llvm.fmuladd.f32(float %.pn11, float %.pn41, float %186), !dbg !22
%189 = tail call float @llvm.fmuladd.f32(float %.pn9, float %.pn41, float %187), !dbg !22
%190 = tail call float @llvm.fmuladd.f32(float %75, float %144, float %188), !dbg !22
%191 = tail call float @llvm.fmuladd.f32(float %79, float %144, float %189), !dbg !22
%192 = tail call float @llvm.fmuladd.f32(float %82, float %147, float %190), !dbg !22
%193 = tail call float @llvm.fmuladd.f32(float %86, float %147, float %191), !dbg !22
%194 = tail call float @llvm.fmuladd.f32(float %90, float %151, float %192), !dbg !22
%195 = tail call float @llvm.fmuladd.f32(float %94, float %151, float %193), !dbg !22
%196 = tail call float @llvm.fmuladd.f32(float %98, float %155, float %194), !dbg !22
%197 = tail call float @llvm.fmuladd.f32(float %102, float %155, float %195), !dbg !22
%198 = tail call float @llvm.fmuladd.f32(float %106, float %159, float %196), !dbg !22
%199 = tail call float @llvm.fmuladd.f32(float %110, float %159, float %197), !dbg !22
%200 = tail call float @llvm.fmuladd.f32(float %114, float %163, float %198), !dbg !22
%201 = tail call float @llvm.fmuladd.f32(float %118, float %163, float %199), !dbg !22
%202 = tail call float @llvm.fmuladd.f32(float %122, float %167, float %200), !dbg !22
%203 = tail call float @llvm.fmuladd.f32(float %126, float %167, float %201), !dbg !22
%204 = tail call float @llvm.fmuladd.f32(float %130, float %171, float %202), !dbg !22
%205 = tail call float @llvm.fmuladd.f32(float %134, float %171, float %203), !dbg !22
%206 = insertvalue { float, float } undef, float %204, 0, !dbg !22
%207 = insertvalue { float, float } %206, float %205, 1, !dbg !22
%208 = add nuw nsw i32 %61, 16, !dbg !21
%209 = icmp ult i32 %61, 1008, !dbg !21
%.urem = urem i32 %60, 3
%.urem66 = urem i32 %59, 3
%210 = getelementptr float, ptr addrspace(1) %.pn361, i64 16, !dbg !19
%211 = getelementptr float, ptr addrspace(1) %.pn762, i64 %40, !dbg !20
%212 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %210, i1 %209) #2, !dbg !17
%213 = extractvalue { i32, i32 } %212, 0, !dbg !17
%214 = extractvalue { i32, i32 } %212, 1, !dbg !17
tail call void @llvm.nvvm.barrier0(), !dbg !17
%215 = shl nuw nsw i32 %.urem, 8, !dbg !17
%216 = zext i32 %215 to i64
%gep = getelementptr float, ptr addrspace(3) %32, i64 %216, !dbg !17
%217 = getelementptr float, ptr addrspace(3) %gep, i64 1, !dbg !17
store i32 %213, ptr addrspace(3) %gep, align 4, !dbg !17
store i32 %214, ptr addrspace(3) %217, align 4, !dbg !17
%218 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %211, i1 %209) #2, !dbg !18
%219 = extractvalue { i32, i32 } %218, 0, !dbg !18
%220 = extractvalue { i32, i32 } %218, 1, !dbg !18
%gep60 = getelementptr float, ptr addrspace(3) %37, i64 %216, !dbg !18
%221 = getelementptr float, ptr addrspace(3) %gep60, i64 1, !dbg !18
store i32 %219, ptr addrspace(3) %gep60, align 4, !dbg !18
store i32 %220, ptr addrspace(3) %221, align 4, !dbg !18
%222 = shl nuw nsw i32 %.urem66, 8, !dbg !17
%223 = zext i32 %222 to i64
%224 = getelementptr float, ptr addrspace(3) @global_smem, i64 %223, !dbg !17
%225 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } undef, ptr addrspace(3) %224, 0, !dbg !17
%226 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %225, i32 16, 1, !dbg !17
%227 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %226, i32 1, 2, !dbg !17
%228 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %227, i32 0, 3, !dbg !17
%229 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %228, i32 0, 4, !dbg !17
%230 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 3072), i64 %223, !dbg !18
%231 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } undef, ptr addrspace(3) %230, 0, !dbg !18
%232 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %231, i32 16, 1, !dbg !18
%233 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %232, i32 1, 2, !dbg !18
%234 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %233, i32 0, 3, !dbg !18
%235 = insertvalue { ptr addrspace(3), i32, i32, i32, i32 } %234, i32 0, 4, !dbg !18
%236 = add nuw nsw i32 %60, 1, !dbg !21
%237 = add nuw nsw i32 %59, 1, !dbg !21
tail call void @llvm.nvvm.barrier0(), !dbg !17
%238 = add nuw nsw i32 %65, 16, !dbg !21
%239 = icmp ult i32 %65, 1008, !dbg !21
br i1 %239, label %58, label %240, !dbg !21
240: ; preds = %58
%241 = lshr i32 %8, 4, !dbg !23
%242 = shl nuw nsw i32 %10, 1, !dbg !23
%243 = or i32 %242, %241, !dbg !23
%244 = mul i32 %13, %5, !dbg !24
%245 = sext i32 %244 to i64, !dbg !25
%246 = getelementptr float, ptr addrspace(1) %2, i64 %245, !dbg !25
%247 = getelementptr float, ptr addrspace(1) %246, i64 %19, !dbg !26
tail call void @llvm.nvvm.barrier0(), !dbg !23
%248 = mul nuw nsw i32 %243, 18, !dbg !23
%249 = add nuw nsw i32 %248, %56, !dbg !23
%250 = zext i32 %249 to i64, !dbg !23
%251 = getelementptr float, ptr addrspace(3) @global_smem, i64 %250, !dbg !23
%252 = insertelement <1 x float> undef, float %204, i64 0, !dbg !23
store <1 x float> %252, ptr addrspace(3) %251, align 4, !dbg !23
%253 = add nuw nsw i32 %249, 144, !dbg !23
%254 = zext i32 %253 to i64, !dbg !23
%255 = getelementptr float, ptr addrspace(3) @global_smem, i64 %254, !dbg !23
%256 = insertelement <1 x float> undef, float %205, i64 0, !dbg !23
store <1 x float> %256, ptr addrspace(3) %255, align 4, !dbg !23
tail call void @llvm.nvvm.barrier0(), !dbg !23
%257 = mul nuw nsw i32 %13, 18, !dbg !23
%258 = add nuw nsw i32 %257, %15, !dbg !23
%259 = zext i32 %258 to i64, !dbg !23
%260 = getelementptr float, ptr addrspace(3) @global_smem, i64 %259, !dbg !23
%.extract = load i32, ptr addrspace(3) %260, align 8, !dbg !23
%261 = getelementptr inbounds <2 x i32>, ptr addrspace(3) %260, i64 0, i64 1, !dbg !23
%.extract57 = load i32, ptr addrspace(3) %261, align 4, !dbg !23
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %.extract, i32 %.extract57, ptr addrspace(1) %247, i1 true) #2, !dbg !23
ret void, !dbg !27
}
; External declarations referenced by the kernel, followed by the attribute
; groups (#0-#2) that the declarations and call sites point at.
;
; NVPTX intrinsic returning an i32 read from the PTX special register tid.x.
; No call to it appears in the tail of the kernel shown here; presumably it is
; used earlier in the function body -- confirm against the full listing.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; NVPTX barrier intrinsic. The kernel calls it before and after its stores to
; and loads from addrspace(3) memory. It carries `convergent` (group #1).
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #1
; LLVM multiply-add intrinsic on float (three float operands, float result).
; Declared with memory(none) via group #0.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.fmuladd.f32(float, float, float) #0
; #0: shared by the tid.x and fmuladd declarations above.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #1: used only by the barrier declaration above.
attributes #1 = { convergent nocallback nounwind }
; #2: attached at call sites of the inline-asm ld.global / st.global sequences
; in the kernel body.
attributes #2 = { nounwind }
; Module-level named metadata: module flags, the debug compile unit, and the
; NVVM annotations for the kernel.
!llvm.module.flags = !{!0}
!llvm.dbg.cu = !{!1}
; NOTE(review): !3 and !4 are each listed twice here. This is how the module
; was emitted; the duplicates look redundant -- confirm before de-duplicating.
!nvvm.annotations = !{!3, !4, !4, !3}
; Module flag: "Debug Info Version" = 3, with merge behavior 2.
!0 = !{i32 2, !"Debug Info Version", i32 3}
; Compile unit recorded with producer "triton" and full debug emission.
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
; Source file that every debug location below resolves to.
!2 = !DIFile(filename: "run_triton_matmul.py", directory: "/home/hzhuang/code")
; NVVM annotation: marks the function as a kernel entry point.
!3 = !{ptr @matmul_kernel_0d1d2d3d4c5d6c7d8c, !"kernel", i32 1}
; NVVM annotation: "maxntidx" = 128 for the same function.
!4 = !{ptr @matmul_kernel_0d1d2d3d4c5d6c7d8c, !"maxntidx", i32 128}
; Debug subprogram for the kernel; its definition is recorded at line 171 of !2.
!5 = distinct !DISubprogram(name: "matmul_kernel_0d1d2d3d4c5d6c7d8c", linkageName: "matmul_kernel_0d1d2d3d4c5d6c7d8c", scope: !2, file: !2, line: 171, type: !6, scopeLine: 171, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
; Subroutine type with an empty type list (!7).
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
!7 = !{}
; Source locations (!8-!27), all scoped to subprogram !5 and covering lines
; 183-194 of run_triton_matmul.py. Instructions reference them via `!dbg !N`.
!8 = !DILocation(line: 183, column: 28, scope: !5)
!9 = !DILocation(line: 183, column: 58, scope: !5)
!10 = !DILocation(line: 183, column: 39, scope: !5)
!11 = !DILocation(line: 183, column: 21, scope: !5)
!12 = !DILocation(line: 183, column: 51, scope: !5)
!13 = !DILocation(line: 184, column: 39, scope: !5)
!14 = !DILocation(line: 184, column: 21, scope: !5)
!15 = !DILocation(line: 184, column: 51, scope: !5)
!16 = !DILocation(line: 191, column: 33, scope: !5)
; !17 / !18 are attached to the two inline-asm ld.global sequences and their
; accompanying addrspace(3) stores in the kernel body.
!17 = !DILocation(line: 187, column: 20, scope: !5)
!18 = !DILocation(line: 188, column: 20, scope: !5)
; !19 / !20 are attached to the two addrspace(1) getelementptr instructions
; that precede those ld.global sequences.
!19 = !DILocation(line: 190, column: 18, scope: !5)
!20 = !DILocation(line: 191, column: 18, scope: !5)
; !21 is attached to the counter increments, the `icmp ult ... 1008` compare,
; and the conditional back-branch.
!21 = !DILocation(line: 186, column: 25, scope: !5)
; !22 carries a line number only (no column field).
!22 = !DILocation(line: 189, scope: !5)
; !23-!27 are attached to the instructions after the loop exit: the output
; address computation (!24-!26), the st.global sequence (!23), and `ret` (!27).
!23 = !DILocation(line: 194, column: 21, scope: !5)
!24 = !DILocation(line: 193, column: 39, scope: !5)
!25 = !DILocation(line: 193, column: 21, scope: !5)
!26 = !DILocation(line: 193, column: 51, scope: !5)
!27 = !DILocation(line: 194, column: 4, scope: !5)
['__class__', '__class_getitem__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__missing__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'clear', 'copy', 'default_factory', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment