Skip to content

Instantly share code, notes, and snippets.

@KoolJBlack
Created January 31, 2024 19:44
Show Gist options
  • Save KoolJBlack/cec5481510e3852d87e070f2dcc22aec to your computer and use it in GitHub Desktop.
Save KoolJBlack/cec5481510e3852d87e070f2dcc22aec to your computer and use it in GitHub Desktop.
// -----// IR Dump After DumpExecutableSourcesPass (iree-hal-dump-executable-sources) //----- //
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu = "znver2", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+bmi,+bmi2,+aes,+pclmul,+adx,+clflushopt,+clwb,+f16c,+clzero,+cx16,+cx8,+crc32,+fsgsbase,+fxsr,+lzcnt,+movbe,+mwaitx,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 32 : index, target_triple = "x86_64-unknown-linux-gnu", ukernels = "default"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#pipeline_layout1 = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_system_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
// Dispatch 0: packs the 4096x4096xf32 tensor read from binding 0 into a
// 512x4096x8x1xf32 tiled layout (inner tiles [8, 1] on dims [0, 1]) and
// writes it to binding 1 at offset 0.
hal.executable private @simpul_mul_dispatch_0 {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
// Entry point ordinal 0, using #pipeline_layout (binding 0 read-only, binding 1 writable).
hal.executable.export public @simpul_mul_dispatch_0_pack_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
// The workgroup count region forwards the three results of
// flow.dispatch.workgroup_count_from_slice unchanged.
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simpul_mul_dispatch_0_pack_f32() {
%c0 = arith.constant 0 : index
// %0: read-only source view (binding 0); %1: write-only packed destination (binding 1).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x4096xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<512x4096x8x1xf32>>
// Load the entire source tensor (offsets 0, full sizes, unit strides).
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf32>> -> tensor<4096x4096xf32>
%3 = tensor.empty() : tensor<512x4096x8x1xf32>
// Identity outer permutation; dim 0 is tiled by 8 and dim 1 by 1, giving 512x4096 outer tiles.
%pack = tensor.pack %2 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 : tensor<4096x4096xf32> -> tensor<512x4096x8x1xf32>
// Store the whole packed result to the destination binding.
flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [512, 4096, 8, 1], strides = [1, 1, 1, 1] : tensor<512x4096x8x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x4096x8x1xf32>>
return
}
}
}
}
// Dispatch 1: packs the 4096x4096xf32 tensor read from binding 0 into a
// 512x4096x8x1xf32 layout using a transposing pack (outer_dims_perm = [1, 0],
// inner_dims_pos = [1, 0]) and writes it to binding 1 at offset 67108864.
hal.executable private @simpul_mul_dispatch_1 {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
// Entry point ordinal 0, using #pipeline_layout (binding 0 read-only, binding 1 writable).
hal.executable.export public @simpul_mul_dispatch_1_pack_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
// The workgroup count region forwards the three results of
// flow.dispatch.workgroup_count_from_slice unchanged.
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simpul_mul_dispatch_1_pack_f32() {
%c0 = arith.constant 0 : index
// 67108864 = 4096 * 4096 * 4; presumably a byte offset that places this output
// directly after dispatch 0's packed output -- TODO confirm offset units.
%c67108864 = arith.constant 67108864 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x4096xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c67108864) : !flow.dispatch.tensor<writeonly:tensor<512x4096x8x1xf32>>
// Load the entire source tensor.
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x4096xf32>> -> tensor<4096x4096xf32>
%3 = tensor.empty() : tensor<512x4096x8x1xf32>
// Source dim 1 is tiled by 8 and source dim 0 by 1; the outer dims are swapped,
// so the result's leading dim (512) comes from source dim 1.
%pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 1] into %3 : tensor<4096x4096xf32> -> tensor<512x4096x8x1xf32>
// Store the whole packed result to the destination binding.
flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [512, 4096, 8, 1], strides = [1, 1, 1, 1] : tensor<512x4096x8x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x4096x8x1xf32>>
return
}
}
}
}
// Dispatch 2: packs a 4096x4096xf32 tensor into a 512x512x8x8xf32 layout and
// writes it to binding 0 at offset 134217728. It has no input binding
// (#pipeline_layout1 declares a single storage buffer).
//
// NOTE(review): the pack source (%2) is a tensor.empty(), so the values stored
// here are unspecified. @simpul_mul_dispatch_3 loads the same offset
// (134217728) as the readwrite accumulator of linalg.mmt4d. Confirm whether the
// original program meant the accumulator to be zero-filled.
hal.executable private @simpul_mul_dispatch_2 {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
// Entry point ordinal 0, using the single-binding layout #pipeline_layout1.
hal.executable.export public @simpul_mul_dispatch_2_pack_f32 ordinal(0) layout(#pipeline_layout1) {
^bb0(%arg0: !hal.device):
// The workgroup count region forwards the three results of
// flow.dispatch.workgroup_count_from_slice unchanged.
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simpul_mul_dispatch_2_pack_f32() {
// 134217728 = 2 * 67108864, i.e. past the two packed operands written by dispatches 0 and 1.
%c134217728 = arith.constant 134217728 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c134217728) : !flow.dispatch.tensor<writeonly:tensor<512x512x8x8xf32>>
// %1 is the pack destination; %2 is the pack source. Neither is initialized.
%1 = tensor.empty() : tensor<512x512x8x8xf32>
%2 = tensor.empty() : tensor<4096x4096xf32>
// Both dims are tiled by 8 with identity outer permutation.
%pack = tensor.pack %2 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %1 : tensor<4096x4096xf32> -> tensor<512x512x8x8xf32>
flow.dispatch.tensor.store %pack, %0, offsets = [0, 0, 0, 0], sizes = [512, 512, 8, 8], strides = [1, 1, 1, 1] : tensor<512x512x8x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x512x8x8xf32>>
return
}
}
}
}
// Dispatch 3: linalg.mmt4d over the two packed 512x4096x8x1xf32 operands,
// accumulating into a 512x512x8x8xf32 tensor that is both loaded from and
// stored back to binding 1.
hal.executable private @simpul_mul_dispatch_3 {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
// Entry point ordinal 0, using #pipeline_layout (binding 0 read-only, binding 1 writable).
hal.executable.export public @simpul_mul_dispatch_3_mmt4d_512x512x4096x8x8x1_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
// The workgroup count region forwards the three results of
// flow.dispatch.workgroup_count_from_slice unchanged.
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simpul_mul_dispatch_3_mmt4d_512x512x4096x8x8x1_f32() {
%c0 = arith.constant 0 : index
%c67108864 = arith.constant 67108864 : index
%c134217728 = arith.constant 134217728 : index
// %0 and %1 are two read-only views into binding 0 at offsets 0 and 67108864
// (the offsets written by dispatches 0 and 1 respectively).
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x4096x8x1xf32>>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c67108864) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x4096x8x1xf32>>
// %2 is the read-write accumulator view in binding 1 at offset 134217728
// (the offset written by dispatch 2).
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c134217728) : !flow.dispatch.tensor<readwrite:tensor<512x512x8x8xf32>>
// Load both operands and the existing accumulator contents in full.
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [512, 4096, 8, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x4096x8x1xf32>> -> tensor<512x4096x8x1xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [512, 4096, 8, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x4096x8x1xf32>> -> tensor<512x4096x8x1xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [512, 512, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<512x512x8x8xf32>> -> tensor<512x512x8x8xf32>
// The result accumulates on top of %5, so it depends on what was previously stored there.
%6 = linalg.mmt4d ins(%3, %4 : tensor<512x4096x8x1xf32>, tensor<512x4096x8x1xf32>) outs(%5 : tensor<512x512x8x8xf32>) -> tensor<512x512x8x8xf32>
// Write the accumulated result back over the same view.
flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [512, 512, 8, 8], strides = [1, 1, 1, 1] : tensor<512x512x8x8xf32> -> !flow.dispatch.tensor<readwrite:tensor<512x512x8x8xf32>>
return
}
}
}
}
// Dispatch 4: unpacks the 512x512x8x8xf32 tiled tensor read from binding 0 at
// offset 134217728 back into a 4096x4096xf32 tensor and writes it to binding 1
// at offset 0.
hal.executable private @simpul_mul_dispatch_4 {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
// Entry point ordinal 0, using #pipeline_layout (binding 0 read-only, binding 1 writable).
hal.executable.export public @simpul_mul_dispatch_4_unpack_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
// The workgroup count region forwards the three results of
// flow.dispatch.workgroup_count_from_slice unchanged.
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @simpul_mul_dispatch_4_unpack_f32() {
%c134217728 = arith.constant 134217728 : index
%c0 = arith.constant 0 : index
// %0: read-only tiled source (same offset dispatch 3 stores to); %1: write-only row-major destination.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c134217728) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x512x8x8xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096x4096xf32>>
// Load the entire tiled tensor.
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [512, 512, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x512x8x8xf32>> -> tensor<512x512x8x8xf32>
%3 = tensor.empty() : tensor<4096x4096xf32>
// Inverse of the 8x8 pack used for the accumulator: identity outer permutation, tiles [8, 8].
%unpack = tensor.unpack %2 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %3 : tensor<512x512x8x8xf32> -> tensor<4096x4096xf32>
// Store the whole unpacked result to the destination binding.
flow.dispatch.tensor.store %unpack, %1, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : tensor<4096x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096x4096xf32>>
return
}
}
}
}
// Host entry point. Takes two buffer views, checks that each is a 4096x4096
// f32 dense row-major tensor, runs the five dispatches (three packs, mmt4d,
// unpack) and returns the 4096x4096xf32 result as a buffer view.
func.func @simpul_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @simpul_mul(%input0: tensor<4096x4096xf32>, %input1: tensor<4096x4096xf32>) -> (%output0: tensor<4096x4096xf32>)"}} {
// 201326592 = 3 * 67108864: size of the transient resource that holds the two
// packed operands and the packed accumulator.
%c201326592 = arith.constant 201326592 : index
// 67108864 = 4096 * 4096 * 4: size used for each input and for the result.
%c67108864 = arith.constant 67108864 : index
%c0 = arith.constant 0 : index
%c4096 = arith.constant 4096 : index
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
// Validate and import input0.
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4096, %c4096]) type(%element_type_f32) encoding(%dense_row_major)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4096x4096xf32> in !stream.resource<external>{%c67108864}
// Validate and import input1.
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%c4096, %c4096]) type(%element_type_f32) encoding(%dense_row_major)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4096x4096xf32> in !stream.resource<external>{%c67108864}
// Allocate the external result buffer and the transient scratch buffer; both are uninitialized.
%result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c67108864} => !stream.timepoint
%result_0, %result_timepoint_1 = stream.resource.alloca uninitialized : !stream.resource<transient>{%c201326592} => !stream.timepoint
// Wait for both allocations before executing.
%2 = stream.timepoint.join max(%result_timepoint, %result_timepoint_1) => !stream.timepoint
// Captures: %arg2 = input0, %arg3 = input1, %arg4 = result, %arg5 = transient scratch.
%3 = stream.cmd.execute await(%2) => with(%0 as %arg2: !stream.resource<external>{%c67108864}, %1 as %arg3: !stream.resource<external>{%c67108864}, %result as %arg4: !stream.resource<external>{%c67108864}, %result_0 as %arg5: !stream.resource<transient>{%c201326592}) {
// The three pack dispatches are grouped in one concurrent region. Each is bound to
// the whole scratch buffer; the dispatch bodies write at offsets 0, 67108864 and
// 134217728 respectively.
stream.cmd.concurrent {
// Pack input0 into scratch.
stream.cmd.dispatch @simpul_mul_dispatch_0::@system_elf_x86_64::@simpul_mul_dispatch_0_pack_f32 {
ro %arg2[%c0 for %c67108864] : !stream.resource<external>{%c67108864},
wo %arg5[%c0 for %c201326592] : !stream.resource<transient>{%c201326592}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Pack input1 into scratch.
stream.cmd.dispatch @simpul_mul_dispatch_1::@system_elf_x86_64::@simpul_mul_dispatch_1_pack_f32 {
ro %arg3[%c0 for %c67108864] : !stream.resource<external>{%c67108864},
wo %arg5[%c0 for %c201326592] : !stream.resource<transient>{%c201326592}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Accumulator pack; takes no input resource.
// NOTE(review): the dispatch body packs a tensor.empty(), and the scratch buffer is
// allocated "uninitialized", so the accumulator read by dispatch 3 appears to have
// unspecified contents -- confirm against the source program.
stream.cmd.dispatch @simpul_mul_dispatch_2::@system_elf_x86_64::@simpul_mul_dispatch_2_pack_f32 {
wo %arg5[%c0 for %c201326592] : !stream.resource<transient>{%c201326592}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]}
}
// mmt4d: scratch is bound twice, read-only as binding 0 and read-write as binding 1.
stream.cmd.dispatch @simpul_mul_dispatch_3::@system_elf_x86_64::@simpul_mul_dispatch_3_mmt4d_512x512x4096x8x8x1_f32 {
ro %arg5[%c0 for %c201326592] : !stream.resource<transient>{%c201326592},
rw %arg5[%c0 for %c201326592] : !stream.resource<transient>{%c201326592}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
// Unpack from scratch into the external result buffer.
stream.cmd.dispatch @simpul_mul_dispatch_4::@system_elf_x86_64::@simpul_mul_dispatch_4_unpack_f32 {
ro %arg5[%c0 for %c201326592] : !stream.resource<transient>{%c201326592},
wo %arg4[%c0 for %c67108864] : !stream.resource<external>{%c67108864}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]}
} => !stream.timepoint
// Release the scratch buffer once execution completes, then wait on that
// timepoint before exporting the result.
%4 = stream.resource.dealloca await(%3) => %result_0 : !stream.resource<transient>{%c201326592} => !stream.timepoint
%5 = stream.timepoint.await %4 => %result : !stream.resource<external>{%c67108864}
%6 = stream.tensor.export %5 : tensor<4096x4096xf32> in !stream.resource<external>{%c67108864} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump Before SerializeExecutablesPass (iree-hal-serialize-executables) //----- //
hal.executable private @matmul_input_linked_llvm_cpu {
hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {cpu = "znver2", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+bmi,+bmi2,+aes,+pclmul,+adx,+clflushopt,+clwb,+f16c,+clzero,+cx16,+cx8,+crc32,+fsgsbase,+fxsr,+lzcnt,+movbe,+mwaitx,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 32 : index, target_triple = "x86_64-unknown-linux-gnu", ukernels = "default"}>) {
// Export ordinal 0 (CPUDataTiling). The workgroup count region returns the
// constant triple (1, 16, 1).
hal.executable.export public @simpul_mul_dispatch_0_pack_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
^bb0(%arg0: !hal.device):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
hal.return %c1, %c16, %c1 : index, index, index
}
// Export ordinal 1 (CPUDataTiling). The workgroup count region returns the
// constant triple (8, 2, 1).
hal.executable.export public @simpul_mul_dispatch_1_pack_f32 ordinal(1) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
^bb0(%arg0: !hal.device):
%c8 = arith.constant 8 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
hal.return %c8, %c2, %c1 : index, index, index
}
// Export ordinal 2 (CPUDataTiling), single-binding layout. The workgroup count
// region returns the constant triple (1, 16, 1).
hal.executable.export public @simpul_mul_dispatch_2_pack_f32 ordinal(2) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
^bb0(%arg0: !hal.device):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
hal.return %c1, %c16, %c1 : index, index, index
}
// Export ordinal 3 (Mmt4dTilingExpert). The workgroup count region returns the
// constant triple (512, 512, 1).
hal.executable.export public @simpul_mul_dispatch_3_mmt4d_512x512x4096x8x8x1_f32 ordinal(3) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>} {
^bb0(%arg0: !hal.device):
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
hal.return %c512, %c512, %c1 : index, index, index
}
// Export ordinal 4 (CPUDataTiling). The workgroup count region returns the
// constant triple (1, 16, 1).
hal.executable.export public @simpul_mul_dispatch_4_unpack_f32 ordinal(4) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
^bb0(%arg0: !hal.device):
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
hal.return %c1, %c16, %c1 : index, index, index
}
builtin.module {
// LLVM-dialect lowering of the dispatch-0 pack. Copies f32 elements one at a
// time from the buffer pointed to by %11 to the buffer pointed to by %18 through
// a five-level scalar loop nest, then returns 0.
//
// %arg1 points at an iree_hal_executable_dispatch_state_v0_t and %arg2 at an
// iree_hal_executable_workgroup_state_v0_t; %arg0 is unused here.
// NOTE(review): field meanings below are inferred from usage (field 10 looks
// like the binding pointer table, workgroup_state[0]/[1] like workgroup ids,
// dispatch_state[4]/[5] like workgroup counts) -- confirm against the IREE
// executable library header.
llvm.func @simpul_mul_dispatch_0_pack_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(32768 : index) : i64
%2 = llvm.mlir.constant(63 : index) : i64
%3 = llvm.mlir.constant(8 : index) : i64
%4 = llvm.mlir.constant(1 : index) : i64
%5 = llvm.mlir.constant(32 : index) : i64
%6 = llvm.mlir.constant(4096 : index) : i64
%7 = llvm.mlir.constant(512 : index) : i64
%8 = llvm.mlir.constant(0 : index) : i64
// %11 = first pointer in the table at dispatch_state[10] (source buffer).
%9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
// Tell the optimizer the pointer is 64-byte aligned: (ptr & 63) == 0.
%12 = llvm.ptrtoint %11 : !llvm.ptr to i64
%13 = llvm.and %12, %2 : i64
%14 = llvm.icmp "eq" %13, %8 : i64
"llvm.intr.assume"(%14) : (i1) -> ()
// %18 = second pointer in the same table (destination buffer), also assumed 64-byte aligned.
%15 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%16 = llvm.extractvalue %15[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%17 = llvm.getelementptr %16[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
%18 = llvm.load %17 : !llvm.ptr -> !llvm.ptr
%19 = llvm.ptrtoint %18 : !llvm.ptr to i64
%20 = llvm.and %19, %2 : i64
%21 = llvm.icmp "eq" %20, %8 : i64
"llvm.intr.assume"(%21) : (i1) -> ()
// %24 / %29 = workgroup_state fields 0 / 1; %27 / %31 = dispatch_state fields 4 / 5.
%22 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%23 = llvm.extractvalue %22[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%24 = llvm.zext %23 : i32 to i64
%25 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%26 = llvm.extractvalue %25[4] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%27 = llvm.zext %26 : i32 to i64
%28 = llvm.extractvalue %22[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%29 = llvm.zext %28 : i32 to i64
%30 = llvm.extractvalue %25[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%31 = llvm.zext %30 : i32 to i64
// Loop bounds for the two distributed loops:
//   %32 / %33 = start / step of the ^bb1 loop (field values scaled by 32)
//   %34 / %35 = start / step of the ^bb2 loop (field values scaled by 4096)
%32 = llvm.mul %29, %5 : i64
%33 = llvm.mul %31, %5 : i64
%34 = llvm.mul %24, %6 : i64
%35 = llvm.mul %27, %6 : i64
llvm.br ^bb1(%32 : i64)
// Loop 1: %36 from %32 while < 512, step %33.
^bb1(%36: i64): // 2 preds: ^bb0, ^bb10
%37 = llvm.icmp "slt" %36, %7 : i64
llvm.cond_br %37, ^bb2(%34 : i64), ^bb11
// Loop 2: %38 from %34 while < 4096, step %35.
^bb2(%38: i64): // 2 preds: ^bb1, ^bb9
%39 = llvm.icmp "slt" %38, %6 : i64
llvm.cond_br %39, ^bb3(%8 : i64), ^bb10
// Loop 3: %40 from 0 while < 32, step 1.
^bb3(%40: i64): // 2 preds: ^bb2, ^bb8
%41 = llvm.icmp "slt" %40, %5 : i64
llvm.cond_br %41, ^bb4(%8 : i64), ^bb9
// Loop 4: %42 from 0 while < 4096, step 1.
^bb4(%42: i64): // 2 preds: ^bb3, ^bb7
%43 = llvm.icmp "slt" %42, %6 : i64
llvm.cond_br %43, ^bb5(%8 : i64), ^bb8
// Loop 5: %44 from 0 while < 8, step 1.
^bb5(%44: i64): // 2 preds: ^bb4, ^bb6
%45 = llvm.icmp "slt" %44, %3 : i64
llvm.cond_br %45, ^bb6, ^bb7
^bb6: // pred: ^bb5
// Source element index: (%36*8 + %44 + %40*8) * 4096 + (%38 + %42).
%46 = llvm.mul %36, %3 : i64
%47 = llvm.add %46, %44 : i64
%48 = llvm.mul %40, %3 : i64
%49 = llvm.add %47, %48 : i64
%50 = llvm.add %38, %42 : i64
%51 = llvm.mul %49, %6 : i64
%52 = llvm.add %51, %50 : i64
%53 = llvm.getelementptr %11[%52] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%54 = llvm.load %53 : !llvm.ptr -> f32
// Destination element index: (%36 + %40) * 32768 + (%38 + %42) * 8 + %44.
%55 = llvm.add %36, %40 : i64
%56 = llvm.mul %55, %1 : i64
%57 = llvm.mul %50, %3 : i64
%58 = llvm.add %56, %57 : i64
%59 = llvm.add %58, %44 : i64
%60 = llvm.add %59, %8 : i64
%61 = llvm.getelementptr %18[%60] : (!llvm.ptr, i64) -> !llvm.ptr, f32
llvm.store %54, %61 : f32, !llvm.ptr
%62 = llvm.add %44, %4 : i64
llvm.br ^bb5(%62 : i64)
// Loop latches: increment the induction variable and branch back to the header.
^bb7: // pred: ^bb5
%63 = llvm.add %42, %4 : i64
llvm.br ^bb4(%63 : i64)
^bb8: // pred: ^bb4
%64 = llvm.add %40, %4 : i64
llvm.br ^bb3(%64 : i64)
^bb9: // pred: ^bb3
%65 = llvm.add %38, %35 : i64
llvm.br ^bb2(%65 : i64)
^bb10: // pred: ^bb2
%66 = llvm.add %36, %33 : i64
llvm.br ^bb1(%66 : i64)
// Exit: return status 0.
^bb11: // pred: ^bb1
llvm.return %0 : i32
}
// LLVM-dialect lowering of the dispatch-1 (transposing) pack. Copies
// vector<8xf32> chunks from the buffer pointed to by %11 into the buffer at
// %19 (second binding pointer plus 16777216 f32 elements) through a four-level
// loop nest, then returns 0.
//
// %arg1 points at an iree_hal_executable_dispatch_state_v0_t and %arg2 at an
// iree_hal_executable_workgroup_state_v0_t; %arg0 is unused here.
// NOTE(review): field meanings are inferred from usage (field 10 looks like
// the binding pointer table, workgroup_state[0]/[1] like workgroup ids,
// dispatch_state[4]/[5] like workgroup counts) -- confirm against the IREE
// executable library header.
llvm.func @simpul_mul_dispatch_1_pack_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(8 : index) : i64
%2 = llvm.mlir.constant(32768 : index) : i64
%3 = llvm.mlir.constant(63 : index) : i64
%4 = llvm.mlir.constant(1 : index) : i64
%5 = llvm.mlir.constant(256 : index) : i64
%6 = llvm.mlir.constant(4096 : index) : i64
%7 = llvm.mlir.constant(512 : index) : i64
%8 = llvm.mlir.constant(0 : index) : i64
// %11 = first pointer in the table at dispatch_state[10] (source buffer),
// assumed 64-byte aligned via (ptr & 63) == 0.
%9 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%10 = llvm.extractvalue %9[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
%12 = llvm.ptrtoint %11 : !llvm.ptr to i64
%13 = llvm.and %12, %3 : i64
%14 = llvm.icmp "eq" %13, %8 : i64
"llvm.intr.assume"(%14) : (i1) -> ()
// %19 = second pointer in the table advanced by 16777216 f32 elements
// (16777216 * 4 = 67108864 bytes); the advanced pointer is assumed 64-byte aligned.
%15 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%16 = llvm.extractvalue %15[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%17 = llvm.getelementptr %16[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
%18 = llvm.load %17 : !llvm.ptr -> !llvm.ptr
%19 = llvm.getelementptr %18[16777216] : (!llvm.ptr) -> !llvm.ptr, f32
%20 = llvm.ptrtoint %19 : !llvm.ptr to i64
%21 = llvm.and %20, %3 : i64
%22 = llvm.icmp "eq" %21, %8 : i64
"llvm.intr.assume"(%22) : (i1) -> ()
// %25 / %30 = workgroup_state fields 0 / 1; %28 / %32 = dispatch_state fields 4 / 5.
%23 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%24 = llvm.extractvalue %23[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%25 = llvm.zext %24 : i32 to i64
%26 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%27 = llvm.extractvalue %26[4] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%28 = llvm.zext %27 : i32 to i64
%29 = llvm.extractvalue %23[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%30 = llvm.zext %29 : i32 to i64
%31 = llvm.extractvalue %26[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%32 = llvm.zext %31 : i32 to i64
// Loop bounds for the two distributed loops:
//   %33 / %34 = start / step of the ^bb1 loop (field values scaled by 256)
//   %35 / %36 = start / step of the ^bb2 loop (field values scaled by 512)
%33 = llvm.mul %30, %5 : i64
%34 = llvm.mul %32, %5 : i64
%35 = llvm.mul %25, %7 : i64
%36 = llvm.mul %28, %7 : i64
llvm.br ^bb1(%33 : i64)
// Loop 1: %37 from %33 while < 512, step %34.
^bb1(%37: i64): // 2 preds: ^bb0, ^bb8
%38 = llvm.icmp "slt" %37, %7 : i64
llvm.cond_br %38, ^bb2(%35 : i64), ^bb9
// Loop 2: %39 from %35 while < 4096, step %36.
^bb2(%39: i64): // 2 preds: ^bb1, ^bb7
%40 = llvm.icmp "slt" %39, %6 : i64
llvm.cond_br %40, ^bb3(%8 : i64), ^bb8
// Loop 3: %41 from 0 while < 256, step 1.
^bb3(%41: i64): // 2 preds: ^bb2, ^bb6
%42 = llvm.icmp "slt" %41, %5 : i64
llvm.cond_br %42, ^bb4(%8 : i64), ^bb7
// Loop 4: %43 from 0 while < 512, step 1.
^bb4(%43: i64): // 2 preds: ^bb3, ^bb5
%44 = llvm.icmp "slt" %43, %7 : i64
llvm.cond_br %44, ^bb5, ^bb6
^bb5: // pred: ^bb4
// Source element index: (%39 + %43) * 4096 + (%37 + %41) * 8; eight
// consecutive f32 values are loaded as one vector (declared alignment 4).
%45 = llvm.add %39, %43 : i64
%46 = llvm.mul %37, %1 : i64
%47 = llvm.mul %41, %1 : i64
%48 = llvm.add %46, %47 : i64
%49 = llvm.mul %45, %6 : i64
%50 = llvm.add %49, %48 : i64
%51 = llvm.getelementptr %11[%50] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%52 = llvm.load %51 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
// Destination element index: (%37 + %41) * 32768 + (%39 + %43) * 8.
%53 = llvm.add %37, %41 : i64
%54 = llvm.mul %53, %2 : i64
%55 = llvm.mul %45, %1 : i64
%56 = llvm.add %54, %55 : i64
%57 = llvm.add %56, %8 : i64
%58 = llvm.add %57, %8 : i64
%59 = llvm.getelementptr %19[%58] : (!llvm.ptr, i64) -> !llvm.ptr, f32
llvm.store %52, %59 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%60 = llvm.add %43, %4 : i64
llvm.br ^bb4(%60 : i64)
// Loop latches: increment the induction variable and branch back to the header.
^bb6: // pred: ^bb4
%61 = llvm.add %41, %4 : i64
llvm.br ^bb3(%61 : i64)
^bb7: // pred: ^bb3
%62 = llvm.add %39, %36 : i64
llvm.br ^bb2(%62 : i64)
^bb8: // pred: ^bb2
%63 = llvm.add %37, %34 : i64
llvm.br ^bb1(%63 : i64)
// Exit: return status 0.
^bb9: // pred: ^bb1
llvm.return %0 : i32
}
// LLVM-dialect lowering of the dispatch-2 pack. The body performs no loads or
// stores; it only returns 0.
// NOTE(review): since nothing is written here, the accumulator region that the
// mmt4d dispatch later reads keeps whatever the buffer already contained.
// Confirm this matches the intent of the source program.
llvm.func @simpul_mul_dispatch_2_pack_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
llvm.return %0 : i32
}
llvm.func @simpul_mul_dispatch_3_mmt4d_512x512x4096x8x8x1_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(64 : index) : i64
%2 = llvm.mlir.constant(63 : index) : i64
%3 = llvm.mlir.constant(8 : index) : i64
%4 = llvm.mlir.constant(32768 : index) : i64
%5 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x1x8x8xf32>) : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%6 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x8x8xf32>) : !llvm.array<1 x array<8 x vector<8xf32>>>
%7 = llvm.mlir.constant(7 : index) : i64
%8 = llvm.mlir.constant(6 : index) : i64
%9 = llvm.mlir.constant(5 : index) : i64
%10 = llvm.mlir.constant(4 : index) : i64
%11 = llvm.mlir.constant(3 : index) : i64
%12 = llvm.mlir.constant(2 : index) : i64
%13 = llvm.mlir.constant(dense<0.000000e+00> : vector<8x8xf32>) : !llvm.array<8 x vector<8xf32>>
%14 = llvm.mlir.constant(1 : index) : i64
%15 = llvm.mlir.constant(4096 : index) : i64
%16 = llvm.mlir.constant(512 : index) : i64
%17 = llvm.mlir.constant(0 : index) : i64
%18 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%19 = llvm.extractvalue %18[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%20 = llvm.load %19 : !llvm.ptr -> !llvm.ptr
%21 = llvm.ptrtoint %20 : !llvm.ptr to i64
%22 = llvm.and %21, %2 : i64
%23 = llvm.icmp "eq" %22, %17 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
%24 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%25 = llvm.extractvalue %24[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%26 = llvm.load %25 : !llvm.ptr -> !llvm.ptr
%27 = llvm.getelementptr %26[16777216] : (!llvm.ptr) -> !llvm.ptr, f32
%28 = llvm.ptrtoint %27 : !llvm.ptr to i64
%29 = llvm.and %28, %2 : i64
%30 = llvm.icmp "eq" %29, %17 : i64
"llvm.intr.assume"(%30) : (i1) -> ()
%31 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%32 = llvm.extractvalue %31[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%33 = llvm.getelementptr %32[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
%34 = llvm.load %33 : !llvm.ptr -> !llvm.ptr
%35 = llvm.getelementptr %34[33554432] : (!llvm.ptr) -> !llvm.ptr, f32
%36 = llvm.ptrtoint %35 : !llvm.ptr to i64
%37 = llvm.and %36, %2 : i64
%38 = llvm.icmp "eq" %37, %17 : i64
"llvm.intr.assume"(%38) : (i1) -> ()
%39 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%40 = llvm.extractvalue %39[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%41 = llvm.zext %40 : i32 to i64
%42 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%43 = llvm.extractvalue %42[4] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%44 = llvm.zext %43 : i32 to i64
%45 = llvm.extractvalue %39[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%46 = llvm.zext %45 : i32 to i64
%47 = llvm.extractvalue %42[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%48 = llvm.zext %47 : i32 to i64
llvm.br ^bb1(%46 : i64)
^bb1(%49: i64): // 2 preds: ^bb0, ^bb7
%50 = llvm.icmp "slt" %49, %16 : i64
llvm.cond_br %50, ^bb2(%41 : i64), ^bb8
^bb2(%51: i64): // 2 preds: ^bb1, ^bb6
%52 = llvm.icmp "slt" %51, %16 : i64
llvm.cond_br %52, ^bb3, ^bb7
^bb3: // pred: ^bb2
%53 = llvm.mul %49, %4 : i64
%54 = llvm.mul %51, %1 : i64
%55 = llvm.add %53, %54 : i64
%56 = llvm.mul %17, %3 : i64
%57 = llvm.add %55, %56 : i64
%58 = llvm.add %57, %17 : i64
%59 = llvm.getelementptr %35[%58] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%60 = llvm.load %59 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%61 = llvm.insertvalue %60, %13[0] : !llvm.array<8 x vector<8xf32>>
%62 = llvm.mul %14, %3 : i64
%63 = llvm.add %55, %62 : i64
%64 = llvm.add %63, %17 : i64
%65 = llvm.getelementptr %35[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%66 = llvm.load %65 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%67 = llvm.insertvalue %66, %61[1] : !llvm.array<8 x vector<8xf32>>
%68 = llvm.mul %12, %3 : i64
%69 = llvm.add %55, %68 : i64
%70 = llvm.add %69, %17 : i64
%71 = llvm.getelementptr %35[%70] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%72 = llvm.load %71 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%73 = llvm.insertvalue %72, %67[2] : !llvm.array<8 x vector<8xf32>>
%74 = llvm.mul %11, %3 : i64
%75 = llvm.add %55, %74 : i64
%76 = llvm.add %75, %17 : i64
%77 = llvm.getelementptr %35[%76] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%78 = llvm.load %77 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%79 = llvm.insertvalue %78, %73[3] : !llvm.array<8 x vector<8xf32>>
%80 = llvm.mul %10, %3 : i64
%81 = llvm.add %55, %80 : i64
%82 = llvm.add %81, %17 : i64
%83 = llvm.getelementptr %35[%82] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%84 = llvm.load %83 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%85 = llvm.insertvalue %84, %79[4] : !llvm.array<8 x vector<8xf32>>
%86 = llvm.mul %9, %3 : i64
%87 = llvm.add %55, %86 : i64
%88 = llvm.add %87, %17 : i64
%89 = llvm.getelementptr %35[%88] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%90 = llvm.load %89 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%91 = llvm.insertvalue %90, %85[5] : !llvm.array<8 x vector<8xf32>>
%92 = llvm.mul %8, %3 : i64
%93 = llvm.add %55, %92 : i64
%94 = llvm.add %93, %17 : i64
%95 = llvm.getelementptr %35[%94] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%96 = llvm.load %95 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%97 = llvm.insertvalue %96, %91[6] : !llvm.array<8 x vector<8xf32>>
%98 = llvm.mul %7, %3 : i64
%99 = llvm.add %55, %98 : i64
%100 = llvm.add %99, %17 : i64
%101 = llvm.getelementptr %35[%100] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%102 = llvm.load %101 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%103 = llvm.insertvalue %102, %97[7] : !llvm.array<8 x vector<8xf32>>
%104 = llvm.insertvalue %103, %6[0] : !llvm.array<1 x array<8 x vector<8xf32>>>
%105 = llvm.insertvalue %104, %5[0] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.br ^bb4(%17, %105 : i64, !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>)
^bb4(%106: i64, %107: !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>): // 2 preds: ^bb3, ^bb5
%108 = llvm.icmp "slt" %106, %15 : i64
llvm.cond_br %108, ^bb5, ^bb6
^bb5: // pred: ^bb4
%109 = llvm.mul %51, %4 : i64
%110 = llvm.mul %106, %3 : i64
%111 = llvm.add %109, %110 : i64
%112 = llvm.add %111, %17 : i64
%113 = llvm.add %112, %17 : i64
%114 = llvm.getelementptr %27[%113] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%115 = llvm.load %114 {alignment = 4 : i64} : !llvm.ptr -> vector<8xf32>
%116 = llvm.add %53, %110 : i64
%117 = llvm.add %116, %17 : i64
%118 = llvm.add %117, %17 : i64
%119 = llvm.getelementptr %20[%118] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%120 = llvm.load %119 : !llvm.ptr -> f32
%121 = llvm.mlir.undef : vector<8xf32>
%122 = llvm.insertelement %120, %121[%0 : i32] : vector<8xf32>
%123 = llvm.shufflevector %122, %121 [0, 0, 0, 0, 0, 0, 0, 0] : vector<8xf32>
%124 = llvm.extractvalue %107[0, 0, 0] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%125 = llvm.intr.fmuladd(%123, %115, %124) : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
%126 = llvm.insertvalue %125, %13[0] : !llvm.array<8 x vector<8xf32>>
%127 = llvm.add %116, %14 : i64
%128 = llvm.add %127, %17 : i64
%129 = llvm.getelementptr %20[%128] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%130 = llvm.load %129 : !llvm.ptr -> f32
%131 = llvm.insertelement %130, %121[%0 : i32] : vector<8xf32>
%132 = llvm.shufflevector %131, %121 [0, 0, 0, 0, 0, 0, 0, 0] : vector<8xf32>
%133 = llvm.extractvalue %107[0, 0, 1] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%134 = llvm.intr.fmuladd(%132, %115, %133) : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
%135 = llvm.insertvalue %134, %126[1] : !llvm.array<8 x vector<8xf32>>
%136 = llvm.add %116, %12 : i64
%137 = llvm.add %136, %17 : i64
%138 = llvm.getelementptr %20[%137] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%139 = llvm.load %138 : !llvm.ptr -> f32
%140 = llvm.insertelement %139, %121[%0 : i32] : vector<8xf32>
%141 = llvm.shufflevector %140, %121 [0, 0, 0, 0, 0, 0, 0, 0] : vector<8xf32>
%142 = llvm.extractvalue %107[0, 0, 2] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%143 = llvm.intr.fmuladd(%141, %115, %142) : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
%144 = llvm.insertvalue %143, %135[2] : !llvm.array<8 x vector<8xf32>>
%145 = llvm.add %116, %11 : i64
%146 = llvm.add %145, %17 : i64
%147 = llvm.getelementptr %20[%146] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%148 = llvm.load %147 : !llvm.ptr -> f32
%149 = llvm.insertelement %148, %121[%0 : i32] : vector<8xf32>
%150 = llvm.shufflevector %149, %121 [0, 0, 0, 0, 0, 0, 0, 0] : vector<8xf32>
%151 = llvm.extractvalue %107[0, 0, 3] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%152 = llvm.intr.fmuladd(%150, %115, %151) : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
%153 = llvm.insertvalue %152, %144[3] : !llvm.array<8 x vector<8xf32>>
%154 = llvm.add %116, %10 : i64
%155 = llvm.add %154, %17 : i64
%156 = llvm.getelementptr %20[%155] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%157 = llvm.load %156 : !llvm.ptr -> f32
%158 = llvm.insertelement %157, %121[%0 : i32] : vector<8xf32>
%159 = llvm.shufflevector %158, %121 [0, 0, 0, 0, 0, 0, 0, 0] : vector<8xf32>
%160 = llvm.extractvalue %107[0, 0, 4] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%161 = llvm.intr.fmuladd(%159, %115, %160) : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
%162 = llvm.insertvalue %161, %153[4] : !llvm.array<8 x vector<8xf32>>
%163 = llvm.add %116, %9 : i64
%164 = llvm.add %163, %17 : i64
%165 = llvm.getelementptr %20[%164] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%166 = llvm.load %165 : !llvm.ptr -> f32
%167 = llvm.insertelement %166, %121[%0 : i32] : vector<8xf32>
%168 = llvm.shufflevector %167, %121 [0, 0, 0, 0, 0, 0, 0, 0] : vector<8xf32>
%169 = llvm.extractvalue %107[0, 0, 5] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%170 = llvm.intr.fmuladd(%168, %115, %169) : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
%171 = llvm.insertvalue %170, %162[5] : !llvm.array<8 x vector<8xf32>>
%172 = llvm.add %116, %8 : i64
%173 = llvm.add %172, %17 : i64
%174 = llvm.getelementptr %20[%173] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%175 = llvm.load %174 : !llvm.ptr -> f32
%176 = llvm.insertelement %175, %121[%0 : i32] : vector<8xf32>
%177 = llvm.shufflevector %176, %121 [0, 0, 0, 0, 0, 0, 0, 0] : vector<8xf32>
%178 = llvm.extractvalue %107[0, 0, 6] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%179 = llvm.intr.fmuladd(%177, %115, %178) : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
%180 = llvm.insertvalue %179, %171[6] : !llvm.array<8 x vector<8xf32>>
%181 = llvm.add %116, %7 : i64
%182 = llvm.add %181, %17 : i64
%183 = llvm.getelementptr %20[%182] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%184 = llvm.load %183 : !llvm.ptr -> f32
%185 = llvm.insertelement %184, %121[%0 : i32] : vector<8xf32>
%186 = llvm.shufflevector %185, %121 [0, 0, 0, 0, 0, 0, 0, 0] : vector<8xf32>
%187 = llvm.extractvalue %107[0, 0, 7] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%188 = llvm.intr.fmuladd(%186, %115, %187) : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
%189 = llvm.insertvalue %188, %180[7] : !llvm.array<8 x vector<8xf32>>
%190 = llvm.insertvalue %189, %6[0] : !llvm.array<1 x array<8 x vector<8xf32>>>
%191 = llvm.insertvalue %190, %5[0] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
%192 = llvm.add %106, %14 : i64
llvm.br ^bb4(%192, %191 : i64, !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>)
^bb6: // pred: ^bb4
%193 = llvm.extractvalue %107[0, 0, 0] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.store %193, %59 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%194 = llvm.extractvalue %107[0, 0, 1] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.store %194, %65 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%195 = llvm.extractvalue %107[0, 0, 2] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.store %195, %71 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%196 = llvm.extractvalue %107[0, 0, 3] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.store %196, %77 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%197 = llvm.extractvalue %107[0, 0, 4] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.store %197, %83 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%198 = llvm.extractvalue %107[0, 0, 5] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.store %198, %89 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%199 = llvm.extractvalue %107[0, 0, 6] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.store %199, %95 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%200 = llvm.extractvalue %107[0, 0, 7] : !llvm.array<1 x array<1 x array<8 x vector<8xf32>>>>
llvm.store %200, %101 {alignment = 4 : i64} : vector<8xf32>, !llvm.ptr
%201 = llvm.add %51, %44 : i64
llvm.br ^bb2(%201 : i64)
^bb7: // pred: ^bb2
%202 = llvm.add %49, %48 : i64
llvm.br ^bb1(%202 : i64)
^bb8: // pred: ^bb1
llvm.return %0 : i32
}
// Dispatch entry point, lowered to the LLVM dialect, that copies f32 elements
// from an 8x8-tiled source buffer into a 4096-wide row-major destination.
//
// Arguments:
//   %arg0 - not referenced anywhere in this function body.
//   %arg1 - pointer to an iree_hal_executable_dispatch_state_v0_t struct; fields
//           4, 5 and 10 are read below.
//   %arg2 - pointer to an iree_hal_executable_workgroup_state_v0_t struct;
//           fields 0 and 1 are read below.
// Returns: the constant i32 0 on every path.
//
// NOTE(review): the "unpack" in the name suggests this is the lowering of a
// tensor unpack op with 8x8 inner tiles - confirm against the pre-lowering IR.
llvm.func @simpul_mul_dispatch_4_unpack_f32(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
// Constants shared by the whole function. %0 is the return value; %1 (-1) is
// used by the floor-division sequences; %2 (63) is the alignment mask.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(-1 : index) : i64
%2 = llvm.mlir.constant(63 : index) : i64
%3 = llvm.mlir.constant(64 : index) : i64
%4 = llvm.mlir.constant(32768 : index) : i64
%5 = llvm.mlir.constant(1 : index) : i64
%6 = llvm.mlir.constant(8 : index) : i64
%7 = llvm.mlir.constant(256 : index) : i64
%8 = llvm.mlir.constant(4096 : index) : i64
%9 = llvm.mlir.constant(0 : index) : i64
// Source base pointer: load the first pointer stored behind field 10 of the
// dispatch state, then advance it by 33554432 f32 elements (134217728 bytes).
// NOTE(review): field 10 is presumably the binding pointer table - confirm
// against the IREE executable library ABI.
%10 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%11 = llvm.extractvalue %10[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%12 = llvm.load %11 : !llvm.ptr -> !llvm.ptr
%13 = llvm.getelementptr %12[33554432] : (!llvm.ptr) -> !llvm.ptr, f32
// Tell the optimizer that the offset source address has its low 6 bits clear,
// i.e. it is 64-byte aligned.
%14 = llvm.ptrtoint %13 : !llvm.ptr to i64
%15 = llvm.and %14, %2 : i64
%16 = llvm.icmp "eq" %15, %9 : i64
"llvm.intr.assume"(%16) : (i1) -> ()
// Destination base pointer: the second pointer (index 1) stored behind field
// 10 of the dispatch state, used without any extra offset.
%17 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%18 = llvm.extractvalue %17[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%19 = llvm.getelementptr %18[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
%20 = llvm.load %19 : !llvm.ptr -> !llvm.ptr
// Same 64-byte alignment assumption for the destination address.
%21 = llvm.ptrtoint %20 : !llvm.ptr to i64
%22 = llvm.and %21, %2 : i64
%23 = llvm.icmp "eq" %22, %9 : i64
"llvm.intr.assume"(%23) : (i1) -> ()
// Read workgroup-state fields 0 and 1 and dispatch-state fields 4 and 5, each
// zero-extended from i32 to i64.
// NOTE(review): these look like workgroup id x/y and workgroup count x/y -
// confirm against the struct definitions.
%24 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%25 = llvm.extractvalue %24[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%26 = llvm.zext %25 : i32 to i64
%27 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%28 = llvm.extractvalue %27[4] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%29 = llvm.zext %28 : i32 to i64
%30 = llvm.extractvalue %24[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%31 = llvm.zext %30 : i32 to i64
%32 = llvm.extractvalue %27[5] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%33 = llvm.zext %32 : i32 to i64
// Loop bounds for the two outermost loops:
//   %34 = outer start  (workgroup field 1 * 256)
//   %35 = outer step   (dispatch field 5  * 256)
//   %36 = second start (workgroup field 0 * 4096)
//   %37 = second step  (dispatch field 4  * 4096)
%34 = llvm.mul %31, %7 : i64
%35 = llvm.mul %33, %7 : i64
%36 = llvm.mul %26, %8 : i64
%37 = llvm.mul %29, %8 : i64
llvm.br ^bb1(%34 : i64)
// Loop 1: %38 runs from %34 while %38 < 4096, stepping by %35 (see ^bb12).
^bb1(%38: i64): // 2 preds: ^bb0, ^bb12
%39 = llvm.icmp "slt" %38, %8 : i64
llvm.cond_br %39, ^bb2(%36 : i64), ^bb13
// Loop 2: %40 runs from %36 while %40 < 4096, stepping by %37 (see ^bb11).
^bb2(%40: i64): // 2 preds: ^bb1, ^bb11
%41 = llvm.icmp "slt" %40, %8 : i64
llvm.cond_br %41, ^bb3(%9 : i64), ^bb12
// Loop 3: %42 runs from 0 while %42 < 256, stepping by 8 (see ^bb10).
^bb3(%42: i64): // 2 preds: ^bb2, ^bb10
%43 = llvm.icmp "slt" %42, %7 : i64
llvm.cond_br %43, ^bb4(%9 : i64), ^bb11
// Loop 4: %44 runs from 0 while %44 < 4096, stepping by 8 (see ^bb9).
^bb4(%44: i64): // 2 preds: ^bb3, ^bb9
%45 = llvm.icmp "slt" %44, %8 : i64
llvm.cond_br %45, ^bb5(%9 : i64), ^bb10
// Loop 5: %46 runs from 0 while %46 < 8, stepping by 1 (see ^bb8).
^bb5(%46: i64): // 2 preds: ^bb4, ^bb8
%47 = llvm.icmp "slt" %46, %6 : i64
llvm.cond_br %47, ^bb6(%9 : i64), ^bb9
// Loop 6 (innermost): %48 runs from 0 while %48 < 8, stepping by 1 (see ^bb7).
^bb6(%48: i64): // 2 preds: ^bb5, ^bb7
%49 = llvm.icmp "slt" %48, %6 : i64
llvm.cond_br %49, ^bb7, ^bb8
// Innermost body: copy one f32 element from source to destination.
^bb7: // pred: ^bb6
// Each six-op group below computes a floor division by 8 that rounds toward
// negative infinity: for negative x it evaluates -1 - ((-1 - x) sdiv 8),
// otherwise plain x sdiv 8.
// %55 = floordiv(%38, 8)
%50 = llvm.icmp "slt" %38, %9 : i64
%51 = llvm.sub %1, %38 : i64
%52 = llvm.select %50, %51, %38 : i1, i64
%53 = llvm.sdiv %52, %6 : i64
%54 = llvm.sub %1, %53 : i64
%55 = llvm.select %50, %54, %53 : i1, i64
// %61 = floordiv(%42, 8)
%56 = llvm.icmp "slt" %42, %9 : i64
%57 = llvm.sub %1, %42 : i64
%58 = llvm.select %56, %57, %42 : i1, i64
%59 = llvm.sdiv %58, %6 : i64
%60 = llvm.sub %1, %59 : i64
%61 = llvm.select %56, %60, %59 : i1, i64
// %62 = floordiv(%38, 8) + floordiv(%42, 8): tile index along the row axis.
%62 = llvm.add %55, %61 : i64
// %68 = floordiv(%40, 8)
%63 = llvm.icmp "slt" %40, %9 : i64
%64 = llvm.sub %1, %40 : i64
%65 = llvm.select %63, %64, %40 : i1, i64
%66 = llvm.sdiv %65, %6 : i64
%67 = llvm.sub %1, %66 : i64
%68 = llvm.select %63, %67, %66 : i1, i64
// %74 = floordiv(%44, 8)
%69 = llvm.icmp "slt" %44, %9 : i64
%70 = llvm.sub %1, %44 : i64
%71 = llvm.select %69, %70, %44 : i1, i64
%72 = llvm.sdiv %71, %6 : i64
%73 = llvm.sub %1, %72 : i64
%74 = llvm.select %69, %73, %72 : i1, i64
// %75 = floordiv(%40, 8) + floordiv(%44, 8): tile index along the column axis.
%75 = llvm.add %68, %74 : i64
// Source linear index (in f32 elements, relative to %13):
//   %81 = %62 * 32768 + %75 * 64 + %46 * 8 + %48
// i.e. 64 elements per 8x8 tile and 32768 (= 512 * 64) elements per tile row.
%76 = llvm.mul %62, %4 : i64
%77 = llvm.mul %75, %3 : i64
%78 = llvm.add %76, %77 : i64
%79 = llvm.mul %46, %6 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.add %80, %48 : i64
%82 = llvm.getelementptr %13[%81] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%83 = llvm.load %82 : !llvm.ptr -> f32
// Destination linear index (in f32 elements, relative to %20):
//   row = %38 + %42 + %46, col = %40 + %44 + %48, %89 = row * 4096 + col
%84 = llvm.add %38, %42 : i64
%85 = llvm.add %84, %46 : i64
%86 = llvm.add %40, %44 : i64
%87 = llvm.add %86, %48 : i64
%88 = llvm.mul %85, %8 : i64
%89 = llvm.add %88, %87 : i64
%90 = llvm.getelementptr %20[%89] : (!llvm.ptr, i64) -> !llvm.ptr, f32
llvm.store %83, %90 : f32, !llvm.ptr
// Latch for loop 6: %48 += 1.
%91 = llvm.add %48, %5 : i64
llvm.br ^bb6(%91 : i64)
// Latch for loop 5: %46 += 1.
^bb8: // pred: ^bb6
%92 = llvm.add %46, %5 : i64
llvm.br ^bb5(%92 : i64)
// Latch for loop 4: %44 += 8.
^bb9: // pred: ^bb5
%93 = llvm.add %44, %6 : i64
llvm.br ^bb4(%93 : i64)
// Latch for loop 3: %42 += 8.
^bb10: // pred: ^bb4
%94 = llvm.add %42, %6 : i64
llvm.br ^bb3(%94 : i64)
// Latch for loop 2: %40 += %37.
^bb11: // pred: ^bb3
%95 = llvm.add %40, %37 : i64
llvm.br ^bb2(%95 : i64)
// Latch for loop 1: %38 += %35.
^bb12: // pred: ^bb2
%96 = llvm.add %38, %35 : i64
llvm.br ^bb1(%96 : i64)
// Exit: all loops finished; return 0.
^bb13: // pred: ^bb1
llvm.return %0 : i32
}
}
}
}
// -----// IR Dump After SerializeExecutablesPass (iree-hal-serialize-executables) //----- //
// Executable after serialization: the variant body has been replaced by a
// single binary blob tagged as format "system-elf-x86_64" with MIME type
// "application/x-elf".
// NOTE(review): the `data` attribute text below appears to be elided by the
// dump (it reads `dense<"vector<11192xi8>` with an unbalanced quote), so this
// snippet is presumably not re-parseable as written - confirm before feeding
// it back into a tool.
hal.executable private @matmul_input_linked_llvm_cpu {
hal.executable.binary public @system_elf_x86_64 attributes {data = dense<"vector<11192xi8>, format = "system-elf-x86_64", mime_type = "application/x-elf"}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment