Skip to content

Instantly share code, notes, and snippets.

@antiagainst
Created March 27, 2024 03:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save antiagainst/acb8c2b7fdc9d538210d42b0c62858cd to your computer and use it in GitHub Desktop.
Save antiagainst/acb8c2b7fdc9d538210d42b0c62858cd to your computer and use it in GitHub Desktop.
#loc = loc(unknown)
module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.shared = 0 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
  // ---- Module-level data ---------------------------------------------------
  // printf format string: 32 visible characters + '\n' + NUL = 34 bytes,
  // which is the length constant handed to __ockl_printf_append_string_n below.
  llvm.mlir.global internal constant @printfFormat_0("pid (%u, %u, %u) idx (%3u) x: %u\0A\00") {addr_space = 0 : i32} loc(#loc)
  // Not referenced by any operation in this module.
  llvm.mlir.global internal constant @printfPrefix_0(" x: ") {addr_space = 0 : i32} loc(#loc)
  // Zero-sized external symbol in address space 3; not referenced by the
  // kernel in this module.
  llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8> loc(#loc)

  // ---- External printf helpers (declarations only) -------------------------
  // NOTE(review): operand meanings are not visible here; the comments at the
  // call sites are assumptions to confirm against the OCKL documentation.
  llvm.func @__ockl_printf_begin(i64) -> i64 loc(#loc)
  llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64 loc(#loc)
  llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 loc(#loc)

  // Kernel: each work-item loads one i32 from %arg0 at a computed element
  // offset, prints "pid (x, y, z) idx (i) x: v" through the __ockl_printf_*
  // helpers, then conditionally stores the loaded value to %arg1 at the same
  // offset.
  //
  // NOTE(review): the kernel carries nvvm.kernel / nvvm.maxntid = 128 while
  // its body uses rocdl.* ops and the module declares 64 threads-per-warp x
  // 4 warps; left untouched because later pipeline stages may depend on it.
  llvm.func @kernel_device_print_0d1d(%arg0: !llvm.ptr<1> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !llvm.ptr<1> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false, nvvm.kernel = 1 : ui1, nvvm.maxntid = array<i32: 128>} {
    // Constants.
    %c5_i32 = llvm.mlir.constant(5 : i32) : i32 loc(#loc)
    %fmt_len = llvm.mlir.constant(34 : i64) : i64 loc(#loc)
    %c0_i64 = llvm.mlir.constant(0 : i64) : i64 loc(#loc)
    %true = llvm.mlir.constant(true) : i1 loc(#loc)
    %c0_i32 = llvm.mlir.constant(0 : i32) : i32 loc(#loc)
    %c1_i32 = llvm.mlir.constant(1 : i32) : i32 loc(#loc)
    %c128_i32 = llvm.mlir.constant(128 : i32) : i32 loc(#loc)
    %c2_i32 = llvm.mlir.constant(2 : i32) : i32 loc(#loc)
    %c4_i32 = llvm.mlir.constant(4 : i32) : i32 loc(#loc)
    %c64_i32 = llvm.mlir.constant(64 : i32) : i32 loc(#loc)
    %c0_idx = llvm.mlir.constant(0 : index) : i32 loc(#loc)

    // Split the work-item id into (id mod 64) and ((id / 64) mod 4).
    // 64 and 4 match the module's threads-per-warp and num-warps attributes,
    // so these are presumably lane and warp indices.
    %tid = rocdl.workitem.id.x : i32 loc(#loc)
    %tid_mod64 = llvm.urem %tid, %c64_i32 : i32 loc(#loc)
    %tid_div64 = llvm.udiv %tid, %c64_i32 : i32 loc(#loc)
    %warp = llvm.urem %tid_div64, %c4_i32 : i32 loc(#loc)
    %lane = llvm.urem %tid_mod64, %c64_i32 : i32 loc(#loc)

    // Element offset = (lane mod 128) + (warp mod 2) * 64; the remaining
    // terms multiply by 1 or add values derived from the constant 0.
    %warp_mod2 = llvm.urem %warp, %c2_i32 : i32 loc(#loc)
    %lane_mod128 = llvm.urem %lane, %c128_i32 : i32 loc(#loc)
    %warp_off = llvm.mul %warp_mod2, %c64_i32 : i32 loc(#loc)
    %elem = llvm.add %lane_mod128, %warp_off : i32 loc(#loc)
    %elem_x1 = llvm.mul %elem, %c1_i32 : i32 loc(#loc)
    %blk0 = llvm.urem %c0_i32, %c1_i32 : i32 loc(#loc)
    %blk1 = llvm.urem %blk0, %c1_i32 : i32 loc(#loc)
    %blk_off = llvm.mul %blk1, %c128_i32 : i32 loc(#loc)
    %idx_partial = llvm.add %elem_x1, %blk_off : i32 loc(#loc)
    %idx = llvm.add %idx_partial, %c0_i32 : i32 loc(#loc)
    %offset = llvm.add %idx, %c0_idx : i32 loc(#loc)

    // Address the source element and cast the address-space-1 pointer to the
    // default address space before loading.
    %src = llvm.getelementptr %arg0[%offset] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i32 loc(#loc)
    %src_flat = llvm.addrspacecast %src : !llvm.ptr<1> to !llvm.ptr loc(#loc)

    // The load predicate is the constant true, so ^skip_load (which would
    // yield 0 instead of the loaded value) is never taken.
    llvm.cond_br %true, ^do_load, ^skip_load loc(#loc)
  ^do_load: // pred: entry
    %loaded = llvm.load %src_flat : !llvm.ptr -> i32 loc(#loc)
    llvm.br ^loaded_or_zero(%loaded : i32) loc(#loc)
  ^skip_load: // pred: entry
    llvm.br ^loaded_or_zero(%c0_i32 : i32) loc(#loc)
  ^loaded_or_zero(%x_raw: i32 loc(unknown)): // 2 preds: ^do_load, ^skip_load
    llvm.br ^print_and_store loc(#loc)
  ^print_and_store: // pred: ^loaded_or_zero
    // Round-trip through a one-element vector to obtain the scalar value.
    %x_vec = llvm.bitcast %x_raw : i32 to vector<1xi32> loc(#loc)
    %x = llvm.extractelement %x_vec[%c0_idx : i32] : vector<1xi32> loc(#loc)

    %pid_x = rocdl.workgroup.id.x : i32 loc(#loc)
    %pid_y = rocdl.workgroup.id.y : i32 loc(#loc)
    %pid_z = rocdl.workgroup.id.z : i32 loc(#loc)

    // printf sequence: begin -> append format string (34 bytes) -> append args.
    // The result of each call is threaded into the next one.
    %fmt_ptr = llvm.mlir.addressof @printfFormat_0 : !llvm.ptr loc(#loc)
    %desc_begin = llvm.call @__ockl_printf_begin(%c0_i64) : (i64) -> i64 loc(#loc)
    // Trailing 0 here vs. trailing 1 on append_args: presumably an
    // "is last call" flag — TODO confirm.
    %desc_fmt = llvm.call @__ockl_printf_append_string_n(%desc_begin, %fmt_ptr, %fmt_len, %c0_i32) : (i64, !llvm.ptr, i64, i32) -> i64 loc(#loc)

    // Every printed value is sign-extended to i64. Note that the printed
    // index is %idx (the value before the final "+ 0"), not %offset.
    %pid_x_i64 = llvm.sext %pid_x : i32 to i64 loc(#loc)
    %pid_y_i64 = llvm.sext %pid_y : i32 to i64 loc(#loc)
    %pid_z_i64 = llvm.sext %pid_z : i32 to i64 loc(#loc)
    %idx_i64 = llvm.sext %idx : i32 to i64 loc(#loc)
    %x_i64 = llvm.sext %x : i32 to i64 loc(#loc)
    // The constant 5 equals the number of real values passed (3 pids, idx, x);
    // the two remaining i64 slots are filled with 0.
    %desc_args = llvm.call @__ockl_printf_append_args(%desc_fmt, %c5_i32, %pid_x_i64, %pid_y_i64, %pid_z_i64, %idx_i64, %x_i64, %c0_i64, %c0_i64, %c1_i32) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 loc(#loc)

    // Destination uses the same element offset as the source.
    %dst = llvm.getelementptr %arg1[%offset] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i32 loc(#loc)

    // Store predicate: (warp * 64 + lane) * 1 < 128 (signed compare), ANDed
    // with the constant true.
    %warp_base = llvm.mul %warp, %c64_i32 : i32 loc(#loc)
    %linear = llvm.add %warp_base, %lane : i32 loc(#loc)
    %linear_x1 = llvm.mul %linear, %c1_i32 : i32 loc(#loc)
    %in_range = llvm.icmp "slt" %linear_x1, %c128_i32 : i32 loc(#loc)
    %do_store = llvm.and %true, %in_range : i1 loc(#loc)

    // Pack the scalar into a one-element vector and back to i32 for the store.
    %undef_vec = llvm.mlir.undef : vector<1xi32> loc(#loc)
    %store_vec = llvm.insertelement %x, %undef_vec[%c0_i32 : i32] : vector<1xi32> loc(#loc)
    %store_val = llvm.bitcast %store_vec : vector<1xi32> to i32 loc(#loc)
    llvm.cond_br %do_store, ^store, ^exit loc(#loc)
  ^store: // pred: ^print_and_store
    llvm.store %store_val, %dst : i32, !llvm.ptr<1> loc(#loc)
    llvm.br ^exit loc(#loc)
  ^exit: // 2 preds: ^print_and_store, ^store
    llvm.return loc(#loc)
  } loc(#loc)
} loc(#loc)
module attributes {gpu.container_module} {
  // Device code, targeting the gfx942 chip.
  gpu.module @kernels [#rocdl.target<chip = "gfx942">] {
    // printf format string: 13 visible characters + '\n' + NUL = 15 bytes.
    llvm.mlir.global internal constant @printfFormat_0("Hello from %d\0A\00") {addr_space = 0 : i32}

    // External printf helpers (declarations only).
    // NOTE(review): operand meanings are not visible here; the comments at
    // the call sites are assumptions to confirm against the OCKL documentation.
    llvm.func @__ockl_printf_begin(i64) -> i64
    llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64
    llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64

    // Kernel: every work-item prints "Hello from <its x work-item id>".
    llvm.func @hello() attributes {gpu.kernel, rocdl.kernel} {
      // Constants, hoisted ahead of the calls (all are side-effect free).
      %c0_i64 = llvm.mlir.constant(0 : i64) : i64
      %fmt_len = llvm.mlir.constant(15 : i64) : i64
      // Trailing operand of append_args (1) vs. append_string_n (0):
      // presumably an "is last call" flag — TODO confirm.
      %last = llvm.mlir.constant(1 : i32) : i32
      %not_last = llvm.mlir.constant(0 : i32) : i32
      // Equals the number of real values passed to append_args (one).
      %num_args = llvm.mlir.constant(1 : i32) : i32

      // The work-item id is zero-extended to i64 before being printed.
      %tid = rocdl.workitem.id.x : i32
      %tid_i64 = llvm.zext %tid : i32 to i64

      // printf sequence: begin -> append format string -> append args.
      // The result of each call is threaded into the next one.
      %desc_begin = llvm.call @__ockl_printf_begin(%c0_i64) : (i64) -> i64
      %fmt_addr = llvm.mlir.addressof @printfFormat_0 : !llvm.ptr
      // Pointer to the first byte of the 15-byte format array.
      %fmt_ptr = llvm.getelementptr %fmt_addr[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<15 x i8>
      %desc_fmt = llvm.call @__ockl_printf_append_string_n(%desc_begin, %fmt_ptr, %fmt_len, %not_last) : (i64, !llvm.ptr, i64, i32) -> i64
      // Only the first i64 slot carries a value; the other six are 0.
      %desc_args = llvm.call @__ockl_printf_append_args(%desc_fmt, %num_args, %tid_i64, %c0_i64, %c0_i64, %c0_i64, %c0_i64, %c0_i64, %c0_i64, %last) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
      llvm.return
    }
  }

  // Host entry point: launches @kernels::@hello on a 1x1x1 grid of blocks
  // with 2x1x1 threads per block.
  llvm.func @main() {
    %c1 = llvm.mlir.constant(1 : index) : i64
    %c2 = llvm.mlir.constant(2 : index) : i64
    gpu.launch_func @kernels::@hello blocks in (%c1, %c1, %c1) threads in (%c2, %c1, %c1) : i64
    llvm.return
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment