Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ScottTodd/9ca792a3047138826aa79c2a377a3846 to your computer and use it in GitHub Desktop.
Save ScottTodd/9ca792a3047138826aa79c2a377a3846 to your computer and use it in GitHub Desktop.
// https://github.com/openxla/iree/issues/15800
// -----// IR Dump Before mlir::iree_compiler::IREE::HAL::SerializeTargetExecutablesPass (iree-hal-serialize-target-executables) //----- //
hal.executable public @matmul_accumulate_123x456xf16_times_456x789xf16_into_123x789xf16_dispatch_0 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>) {
// Export entry (ordinal 0) for the dispatch function defined in the inner module.
// The pipeline layout declares no push constants and a single set with three
// storage-buffer bindings: bindings 0 and 1 are ReadOnly, binding 2 carries no
// access qualifier. Codegen used the CPUDoubleTilingPeelingExpert pipeline.
hal.executable.export public @matmul_accumulate_123x456xf16_times_456x789xf16_into_123x789xf16_dispatch_0_matmul_123x789x456_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>} {
// The region takes the device and returns three constant index values (7, 3, 1).
// NOTE(review): presumably the workgroup count along x, y, z — confirm against the HAL dialect docs.
^bb0(%arg0: !hal.device):
%c7 = arith.constant 7 : index
%c3 = arith.constant 3 : index
%c1 = arith.constant 1 : index
hal.return %c7, %c3, %c1 : index, index, index
}
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
// Dispatch function body (LLVM dialect). Takes three noalias, 16-byte-aligned
// pointers and returns an i32. %arg0 is not referenced anywhere in the body;
// %arg1 is read as an iree_hal_executable_dispatch_state_v0_t and %arg2 as an
// iree_hal_executable_workgroup_state_v0_t.
llvm.func @matmul_accumulate_123x456xf16_times_456x789xf16_into_123x789xf16_dispatch_0_matmul_123x789x456_f16(%arg0: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32 {
// Constants used throughout the function:
//   %1  = i32 0: lane index for splats and the function's return value.
//   %2..%7, %18 = row offsets 7..2 and 1 within an 8-row tile.
//   %8 = 41 and %16 = 40: per-%47 row stride and the index of the last (single) row.
//   %11 = 789: row stride used when indexing %32 and %39.
//   %17 = 456: reduction trip count and row stride used when indexing %25.
//   %12 = 63: mask for the 64-byte alignment assumptions below.
//   %13 = lane ids 0..31, compared against a remaining-count splat to build masks.
//   %14 / %15 = zero vector / zero 8x32 array (masked-load passthrough and tile seed).
//   %19 = 32 (vector width step), %20 = 8 (row step), %21 = 128 (column chunk).
%0 = llvm.mlir.constant(0 : i64) : i64
%1 = llvm.mlir.constant(0 : i32) : i32
%2 = llvm.mlir.constant(7 : index) : i64
%3 = llvm.mlir.constant(6 : index) : i64
%4 = llvm.mlir.constant(5 : index) : i64
%5 = llvm.mlir.constant(4 : index) : i64
%6 = llvm.mlir.constant(3 : index) : i64
%7 = llvm.mlir.constant(2 : index) : i64
%8 = llvm.mlir.constant(41 : index) : i64
%9 = llvm.mlir.constant(-1 : index) : i64
%10 = llvm.mlir.constant(-128 : index) : i64
%11 = llvm.mlir.constant(789 : index) : i64
%12 = llvm.mlir.constant(63 : index) : i64
%13 = llvm.mlir.constant(dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]> : vector<32xi64>) : vector<32xi64>
%14 = llvm.mlir.constant(dense<0.000000e+00> : vector<32xf16>) : vector<32xf16>
%15 = llvm.mlir.constant(dense<0.000000e+00> : vector<8x32xf16>) : !llvm.array<8 x vector<32xf16>>
%16 = llvm.mlir.constant(40 : index) : i64
%17 = llvm.mlir.constant(456 : index) : i64
%18 = llvm.mlir.constant(1 : index) : i64
%19 = llvm.mlir.constant(32 : index) : i64
%20 = llvm.mlir.constant(8 : index) : i64
%21 = llvm.mlir.constant(128 : index) : i64
%22 = llvm.mlir.constant(0 : index) : i64
// Load the dispatch state and take struct field 10, a pointer to an array of
// pointers. Entry 0 of that array becomes %25, and it is assumed 64-byte aligned
// ((ptr & 63) == 0).
// NOTE(review): field 10 is presumably the binding pointer table matching the
// three bindings of the export layout — confirm against the IREE executable ABI.
%23 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%24 = llvm.extractvalue %23[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%25 = llvm.load %24 : !llvm.ptr -> !llvm.ptr
%26 = llvm.ptrtoint %25 : !llvm.ptr to i64
%27 = llvm.and %26, %12 : i64
%28 = llvm.icmp "eq" %27, %22 : i64
"llvm.intr.assume"(%28) : (i1) -> ()
// Same sequence for entry 1 of the pointer array (the struct is re-loaded): %32.
%29 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%30 = llvm.extractvalue %29[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%31 = llvm.getelementptr %30[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
%32 = llvm.load %31 : !llvm.ptr -> !llvm.ptr
%33 = llvm.ptrtoint %32 : !llvm.ptr to i64
%34 = llvm.and %33, %12 : i64
%35 = llvm.icmp "eq" %34, %22 : i64
"llvm.intr.assume"(%35) : (i1) -> ()
// Same sequence for entry 2 of the pointer array: %39. This is the only pointer
// the function stores through.
%36 = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%37 = llvm.extractvalue %36[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr, ptr, ptr)>
%38 = llvm.getelementptr %37[2] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
%39 = llvm.load %38 : !llvm.ptr -> !llvm.ptr
%40 = llvm.ptrtoint %39 : !llvm.ptr to i64
%41 = llvm.and %40, %12 : i64
%42 = llvm.icmp "eq" %41, %22 : i64
"llvm.intr.assume"(%42) : (i1) -> ()
// Load the workgroup state; fields 0 and 1 are zero-extended to i64 as %45 and %47.
// NOTE(review): presumably workgroup id x and y — confirm against the IREE executable ABI.
%43 = llvm.load %arg2 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%44 = llvm.extractvalue %43[0] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%45 = llvm.zext %44 : i32 to i64
%46 = llvm.extractvalue %43[1] : !llvm.struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr, i32)>
%47 = llvm.zext %46 : i32 to i64
// %51 = min(789 - 128 * %45, 128): number of columns this workgroup covers.
%48 = llvm.mul %45, %10 : i64
%49 = llvm.add %48, %11 : i64
%50 = llvm.icmp "slt" %49, %21 : i64
%51 = llvm.select %50, %49, %21 : i1, i64
// Enter the outer column loop with induction value 0.
llvm.br ^bb1(%22 : i64)
// Outer loop header: %52 advances by 128 (see ^bb20) while %52 < %51. Because
// %51 is at most 128, the body executes at most once. Exit goes to ^bb21 (return).
^bb1(%52: i64): // 2 preds: ^bb0, ^bb20
%53 = llvm.icmp "slt" %52, %51 : i64
llvm.cond_br %53, ^bb2, ^bb21
^bb2: // pred: ^bb1
// %57 = min(%51 - %52, 128): columns remaining in this chunk.
%54 = llvm.mul %52, %9 : i64
%55 = llvm.add %51, %54 : i64
%56 = llvm.icmp "sgt" %55, %21 : i64
%57 = llvm.select %56, %21, %55 : i1, i64
// %63 = floor(%57 / 32), computed with sdiv plus the (-1 - x) adjustment for
// negative inputs; %64 = %63 * 32 is the column count covered by full 32-lane vectors.
%58 = llvm.icmp "slt" %57, %22 : i64
%59 = llvm.sub %9, %57 : i64
%60 = llvm.select %58, %59, %57 : i1, i64
%61 = llvm.sdiv %60, %19 : i64
%62 = llvm.sub %9, %61 : i64
%63 = llvm.select %58, %62, %61 : i1, i64
%64 = llvm.mul %63, %19 : i64
llvm.br ^bb3(%22 : i64)
// Row loop header: %65 advances by 8 (see ^bb14) while %65 < 40. On exit, control
// moves to the single-row path at ^bb15 with its column index reset to 0.
^bb3(%65: i64): // 2 preds: ^bb2, ^bb14
%66 = llvm.icmp "slt" %65, %16 : i64
llvm.cond_br %66, ^bb4(%22 : i64), ^bb15(%22 : i64)
// Full-vector column loop header: %67 advances by 32 (see ^bb8) while %67 < %64.
// On exit, the masked tail loop at ^bb9 starts from column %64.
^bb4(%67: i64): // 2 preds: ^bb3, ^bb8
%68 = llvm.icmp "slt" %67, %64 : i64
llvm.cond_br %68, ^bb5, ^bb9(%64 : i64)
// Unmasked tile setup: load an 8-row x 32-lane f16 tile from %39 into an
// !llvm.array<8 x vector<32xf16>> that seeds the reduction loop at ^bb6.
^bb5: // pred: ^bb4
// Index arithmetic:
//   %71 (= %80) = %65 + 41 * %47        first row of the tile
//   %74         = 128 * %45 + %52 + %67  first column of the tile
//   element offset of row r = (%80 + r) * 789 + %74
// %69 (= %67 + %52) is reused in ^bb7 to rebuild the same column offset.
%69 = llvm.add %67, %52 : i64
%70 = llvm.mul %47, %8 : i64
%71 = llvm.add %65, %70 : i64
%72 = llvm.mul %45, %21 : i64
%73 = llvm.add %72, %52 : i64
%74 = llvm.add %73, %67 : i64
%75 = llvm.mul %71, %11 : i64
%76 = llvm.add %75, %74 : i64
// Row 0. The row pointers (%77, %84, ...) are kept for the stores in ^bb8.
%77 = llvm.getelementptr %39[%76] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%78 = llvm.load %77 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
%79 = llvm.insertvalue %78, %15[0] : !llvm.array<8 x vector<32xf16>>
// Row 1.
%80 = llvm.add %70, %65 : i64
%81 = llvm.add %80, %18 : i64
%82 = llvm.mul %81, %11 : i64
%83 = llvm.add %82, %74 : i64
%84 = llvm.getelementptr %39[%83] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%85 = llvm.load %84 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
%86 = llvm.insertvalue %85, %79[1] : !llvm.array<8 x vector<32xf16>>
// Row 2.
%87 = llvm.add %80, %7 : i64
%88 = llvm.mul %87, %11 : i64
%89 = llvm.add %88, %74 : i64
%90 = llvm.getelementptr %39[%89] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%91 = llvm.load %90 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
%92 = llvm.insertvalue %91, %86[2] : !llvm.array<8 x vector<32xf16>>
// Row 3.
%93 = llvm.add %80, %6 : i64
%94 = llvm.mul %93, %11 : i64
%95 = llvm.add %94, %74 : i64
%96 = llvm.getelementptr %39[%95] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%97 = llvm.load %96 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
%98 = llvm.insertvalue %97, %92[3] : !llvm.array<8 x vector<32xf16>>
// Row 4.
%99 = llvm.add %80, %5 : i64
%100 = llvm.mul %99, %11 : i64
%101 = llvm.add %100, %74 : i64
%102 = llvm.getelementptr %39[%101] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%103 = llvm.load %102 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
%104 = llvm.insertvalue %103, %98[4] : !llvm.array<8 x vector<32xf16>>
// Row 5.
%105 = llvm.add %80, %4 : i64
%106 = llvm.mul %105, %11 : i64
%107 = llvm.add %106, %74 : i64
%108 = llvm.getelementptr %39[%107] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%109 = llvm.load %108 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
%110 = llvm.insertvalue %109, %104[5] : !llvm.array<8 x vector<32xf16>>
// Row 6.
%111 = llvm.add %80, %3 : i64
%112 = llvm.mul %111, %11 : i64
%113 = llvm.add %112, %74 : i64
%114 = llvm.getelementptr %39[%113] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%115 = llvm.load %114 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
%116 = llvm.insertvalue %115, %110[6] : !llvm.array<8 x vector<32xf16>>
// Row 7.
%117 = llvm.add %80, %2 : i64
%118 = llvm.mul %117, %11 : i64
%119 = llvm.add %118, %74 : i64
%120 = llvm.getelementptr %39[%119] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%121 = llvm.load %120 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
%122 = llvm.insertvalue %121, %116[7] : !llvm.array<8 x vector<32xf16>>
// Start the reduction loop at index 0 with the loaded tile as the accumulator.
llvm.br ^bb6(%22, %122 : i64, !llvm.array<8 x vector<32xf16>>)
// Unmasked reduction loop: %123 runs 0..455 in steps of 1 and %124 carries the
// 8x32 accumulator tile. Exit goes to ^bb8, which stores the tile.
^bb6(%123: i64, %124: !llvm.array<8 x vector<32xf16>>): // 2 preds: ^bb5, ^bb7
%125 = llvm.icmp "slt" %123, %17 : i64
llvm.cond_br %125, ^bb7, ^bb8
^bb7: // pred: ^bb6
// Load one 32-lane vector from %32 at element offset %123 * 789 + %126, where
// %126 = %69 + %72 equals the tile's column offset %74 from ^bb5. This vector
// (%130) is shared by all eight rows below.
%126 = llvm.add %69, %72 : i64
%127 = llvm.mul %123, %11 : i64
%128 = llvm.add %127, %126 : i64
%129 = llvm.getelementptr %32[%128] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%130 = llvm.load %129 {alignment = 2 : i64} : !llvm.ptr -> vector<32xf16>
// Per row r: load the scalar %25[(row) * 456 + %123], splat it to 32 lanes
// (insertelement at lane 0 + all-zero shuffle mask), then acc[r] = fmuladd(splat, %130, acc[r]).
// Row 0. The result array is rebuilt from the zero constant %15; every slot is
// overwritten by the end of the block.
%131 = llvm.mul %71, %17 : i64
%132 = llvm.add %131, %123 : i64
%133 = llvm.getelementptr %25[%132] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%134 = llvm.load %133 : !llvm.ptr -> f16
%135 = llvm.mlir.undef : vector<32xf16>
%136 = llvm.insertelement %134, %135[%1 : i32] : vector<32xf16>
%137 = llvm.shufflevector %136, %135 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%138 = llvm.extractvalue %124[0] : !llvm.array<8 x vector<32xf16>>
%139 = llvm.intr.fmuladd(%137, %130, %138) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%140 = llvm.insertvalue %139, %15[0] : !llvm.array<8 x vector<32xf16>>
// Row 1.
%141 = llvm.mul %81, %17 : i64
%142 = llvm.add %141, %123 : i64
%143 = llvm.getelementptr %25[%142] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%144 = llvm.load %143 : !llvm.ptr -> f16
%145 = llvm.insertelement %144, %135[%1 : i32] : vector<32xf16>
%146 = llvm.shufflevector %145, %135 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%147 = llvm.extractvalue %124[1] : !llvm.array<8 x vector<32xf16>>
%148 = llvm.intr.fmuladd(%146, %130, %147) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%149 = llvm.insertvalue %148, %140[1] : !llvm.array<8 x vector<32xf16>>
// Row 2.
%150 = llvm.mul %87, %17 : i64
%151 = llvm.add %150, %123 : i64
%152 = llvm.getelementptr %25[%151] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%153 = llvm.load %152 : !llvm.ptr -> f16
%154 = llvm.insertelement %153, %135[%1 : i32] : vector<32xf16>
%155 = llvm.shufflevector %154, %135 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%156 = llvm.extractvalue %124[2] : !llvm.array<8 x vector<32xf16>>
%157 = llvm.intr.fmuladd(%155, %130, %156) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%158 = llvm.insertvalue %157, %149[2] : !llvm.array<8 x vector<32xf16>>
// Row 3.
%159 = llvm.mul %93, %17 : i64
%160 = llvm.add %159, %123 : i64
%161 = llvm.getelementptr %25[%160] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%162 = llvm.load %161 : !llvm.ptr -> f16
%163 = llvm.insertelement %162, %135[%1 : i32] : vector<32xf16>
%164 = llvm.shufflevector %163, %135 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%165 = llvm.extractvalue %124[3] : !llvm.array<8 x vector<32xf16>>
%166 = llvm.intr.fmuladd(%164, %130, %165) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%167 = llvm.insertvalue %166, %158[3] : !llvm.array<8 x vector<32xf16>>
// Row 4.
%168 = llvm.mul %99, %17 : i64
%169 = llvm.add %168, %123 : i64
%170 = llvm.getelementptr %25[%169] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%171 = llvm.load %170 : !llvm.ptr -> f16
%172 = llvm.insertelement %171, %135[%1 : i32] : vector<32xf16>
%173 = llvm.shufflevector %172, %135 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%174 = llvm.extractvalue %124[4] : !llvm.array<8 x vector<32xf16>>
%175 = llvm.intr.fmuladd(%173, %130, %174) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%176 = llvm.insertvalue %175, %167[4] : !llvm.array<8 x vector<32xf16>>
// Row 5.
%177 = llvm.mul %105, %17 : i64
%178 = llvm.add %177, %123 : i64
%179 = llvm.getelementptr %25[%178] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%180 = llvm.load %179 : !llvm.ptr -> f16
%181 = llvm.insertelement %180, %135[%1 : i32] : vector<32xf16>
%182 = llvm.shufflevector %181, %135 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%183 = llvm.extractvalue %124[5] : !llvm.array<8 x vector<32xf16>>
%184 = llvm.intr.fmuladd(%182, %130, %183) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%185 = llvm.insertvalue %184, %176[5] : !llvm.array<8 x vector<32xf16>>
// Row 6.
%186 = llvm.mul %111, %17 : i64
%187 = llvm.add %186, %123 : i64
%188 = llvm.getelementptr %25[%187] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%189 = llvm.load %188 : !llvm.ptr -> f16
%190 = llvm.insertelement %189, %135[%1 : i32] : vector<32xf16>
%191 = llvm.shufflevector %190, %135 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%192 = llvm.extractvalue %124[6] : !llvm.array<8 x vector<32xf16>>
%193 = llvm.intr.fmuladd(%191, %130, %192) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%194 = llvm.insertvalue %193, %185[6] : !llvm.array<8 x vector<32xf16>>
// Row 7.
%195 = llvm.mul %117, %17 : i64
%196 = llvm.add %195, %123 : i64
%197 = llvm.getelementptr %25[%196] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%198 = llvm.load %197 : !llvm.ptr -> f16
%199 = llvm.insertelement %198, %135[%1 : i32] : vector<32xf16>
%200 = llvm.shufflevector %199, %135 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%201 = llvm.extractvalue %124[7] : !llvm.array<8 x vector<32xf16>>
%202 = llvm.intr.fmuladd(%200, %130, %201) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%203 = llvm.insertvalue %202, %194[7] : !llvm.array<8 x vector<32xf16>>
// Advance the reduction index by 1 and loop with the updated tile.
%204 = llvm.add %123, %18 : i64
llvm.br ^bb6(%204, %203 : i64, !llvm.array<8 x vector<32xf16>>)
// Unmasked write-back: store each of the eight accumulator rows to the row
// pointer computed in ^bb5 (%77, %84, %90, %96, %102, %108, %114, %120), then
// advance the column index %67 by 32 and return to the column loop header.
^bb8: // pred: ^bb6
%205 = llvm.extractvalue %124[0] : !llvm.array<8 x vector<32xf16>>
llvm.store %205, %77 {alignment = 2 : i64} : vector<32xf16>, !llvm.ptr
%206 = llvm.extractvalue %124[1] : !llvm.array<8 x vector<32xf16>>
llvm.store %206, %84 {alignment = 2 : i64} : vector<32xf16>, !llvm.ptr
%207 = llvm.extractvalue %124[2] : !llvm.array<8 x vector<32xf16>>
llvm.store %207, %90 {alignment = 2 : i64} : vector<32xf16>, !llvm.ptr
%208 = llvm.extractvalue %124[3] : !llvm.array<8 x vector<32xf16>>
llvm.store %208, %96 {alignment = 2 : i64} : vector<32xf16>, !llvm.ptr
%209 = llvm.extractvalue %124[4] : !llvm.array<8 x vector<32xf16>>
llvm.store %209, %102 {alignment = 2 : i64} : vector<32xf16>, !llvm.ptr
%210 = llvm.extractvalue %124[5] : !llvm.array<8 x vector<32xf16>>
llvm.store %210, %108 {alignment = 2 : i64} : vector<32xf16>, !llvm.ptr
%211 = llvm.extractvalue %124[6] : !llvm.array<8 x vector<32xf16>>
llvm.store %211, %114 {alignment = 2 : i64} : vector<32xf16>, !llvm.ptr
%212 = llvm.extractvalue %124[7] : !llvm.array<8 x vector<32xf16>>
llvm.store %212, %120 {alignment = 2 : i64} : vector<32xf16>, !llvm.ptr
%213 = llvm.add %67, %19 : i64
llvm.br ^bb4(%213 : i64)
// Masked column-tail loop header: %214 starts at %64 (passed from ^bb4) and
// advances by 32 (see ^bb13) while %214 < %57. Exit goes to the row step at ^bb14.
^bb9(%214: i64): // 2 preds: ^bb4, ^bb13
%215 = llvm.icmp "slt" %214, %57 : i64
llvm.cond_br %215, ^bb10, ^bb14
^bb10: // pred: ^bb9
// Build the lane mask %221: lane i is active when (%57 - %214) > i, i.e. only
// the columns that remain are touched.
%216 = llvm.mul %214, %9 : i64
%217 = llvm.add %57, %216 : i64
%218 = llvm.mlir.undef : vector<32xi64>
%219 = llvm.insertelement %217, %218[%1 : i32] : vector<32xi64>
%220 = llvm.shufflevector %219, %218 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xi64>
%221 = llvm.icmp "sgt" %220, %13 : vector<32xi64>
// Index arithmetic mirrors ^bb5 with %214 as the column index:
//   %223 (= %232) = %65 + 41 * %47, %226 = 128 * %45 + %52 + %214,
//   element offset of row r = (%232 + r) * 789 + %226.
%222 = llvm.mul %47, %8 : i64
%223 = llvm.add %65, %222 : i64
%224 = llvm.mul %45, %21 : i64
%225 = llvm.add %224, %52 : i64
%226 = llvm.add %225, %214 : i64
%227 = llvm.mul %223, %11 : i64
%228 = llvm.add %227, %226 : i64
// Row 0: masked load from %39; inactive lanes take the zero passthrough %14.
// The row pointers (%229, %236, ...) are kept for the masked stores in ^bb13.
%229 = llvm.getelementptr %39[%228] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%230 = llvm.intr.masked.load %229, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%231 = llvm.insertvalue %230, %15[0] : !llvm.array<8 x vector<32xf16>>
// Row 1.
%232 = llvm.add %222, %65 : i64
%233 = llvm.add %232, %18 : i64
%234 = llvm.mul %233, %11 : i64
%235 = llvm.add %234, %226 : i64
%236 = llvm.getelementptr %39[%235] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%237 = llvm.intr.masked.load %236, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%238 = llvm.insertvalue %237, %231[1] : !llvm.array<8 x vector<32xf16>>
// Row 2.
%239 = llvm.add %232, %7 : i64
%240 = llvm.mul %239, %11 : i64
%241 = llvm.add %240, %226 : i64
%242 = llvm.getelementptr %39[%241] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%243 = llvm.intr.masked.load %242, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%244 = llvm.insertvalue %243, %238[2] : !llvm.array<8 x vector<32xf16>>
// Row 3.
%245 = llvm.add %232, %6 : i64
%246 = llvm.mul %245, %11 : i64
%247 = llvm.add %246, %226 : i64
%248 = llvm.getelementptr %39[%247] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%249 = llvm.intr.masked.load %248, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%250 = llvm.insertvalue %249, %244[3] : !llvm.array<8 x vector<32xf16>>
// Row 4.
%251 = llvm.add %232, %5 : i64
%252 = llvm.mul %251, %11 : i64
%253 = llvm.add %252, %226 : i64
%254 = llvm.getelementptr %39[%253] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%255 = llvm.intr.masked.load %254, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%256 = llvm.insertvalue %255, %250[4] : !llvm.array<8 x vector<32xf16>>
// Row 5.
%257 = llvm.add %232, %4 : i64
%258 = llvm.mul %257, %11 : i64
%259 = llvm.add %258, %226 : i64
%260 = llvm.getelementptr %39[%259] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%261 = llvm.intr.masked.load %260, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%262 = llvm.insertvalue %261, %256[5] : !llvm.array<8 x vector<32xf16>>
// Row 6.
%263 = llvm.add %232, %3 : i64
%264 = llvm.mul %263, %11 : i64
%265 = llvm.add %264, %226 : i64
%266 = llvm.getelementptr %39[%265] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%267 = llvm.intr.masked.load %266, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%268 = llvm.insertvalue %267, %262[6] : !llvm.array<8 x vector<32xf16>>
// Row 7.
%269 = llvm.add %232, %2 : i64
%270 = llvm.mul %269, %11 : i64
%271 = llvm.add %270, %226 : i64
%272 = llvm.getelementptr %39[%271] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%273 = llvm.intr.masked.load %272, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%274 = llvm.insertvalue %273, %268[7] : !llvm.array<8 x vector<32xf16>>
// Start the masked reduction loop at index 0 with the loaded tile.
llvm.br ^bb11(%22, %274 : i64, !llvm.array<8 x vector<32xf16>>)
// Masked reduction loop: %275 runs 0..455 in steps of 1 and %276 carries the
// 8x32 accumulator tile. Exit goes to ^bb13, which performs the masked stores.
^bb11(%275: i64, %276: !llvm.array<8 x vector<32xf16>>): // 2 preds: ^bb10, ^bb12
%277 = llvm.icmp "slt" %275, %17 : i64
llvm.cond_br %277, ^bb12, ^bb13
^bb12: // pred: ^bb11
// Masked load of one 32-lane vector from %32 at element offset %275 * 789 + %226
// (mask %221, zero passthrough). Shared by all eight rows below.
%278 = llvm.mul %275, %11 : i64
%279 = llvm.add %278, %226 : i64
%280 = llvm.getelementptr %32[%279] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%281 = llvm.intr.masked.load %280, %221, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
// Per row r: load the scalar %25[(row) * 456 + %275], splat it, compute
// fmuladd(splat, %281, acc[r]), then select by %221 so inactive lanes keep the
// previous accumulator value.
// Row 0. The result array is rebuilt from the zero constant %15; every slot is
// overwritten by the end of the block.
%282 = llvm.mul %223, %17 : i64
%283 = llvm.add %282, %275 : i64
%284 = llvm.getelementptr %25[%283] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%285 = llvm.load %284 : !llvm.ptr -> f16
%286 = llvm.mlir.undef : vector<32xf16>
%287 = llvm.insertelement %285, %286[%1 : i32] : vector<32xf16>
%288 = llvm.shufflevector %287, %286 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%289 = llvm.extractvalue %276[0] : !llvm.array<8 x vector<32xf16>>
%290 = llvm.intr.fmuladd(%288, %281, %289) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%291 = llvm.select %221, %290, %289 : vector<32xi1>, vector<32xf16>
%292 = llvm.insertvalue %291, %15[0] : !llvm.array<8 x vector<32xf16>>
// Row 1.
%293 = llvm.mul %233, %17 : i64
%294 = llvm.add %293, %275 : i64
%295 = llvm.getelementptr %25[%294] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%296 = llvm.load %295 : !llvm.ptr -> f16
%297 = llvm.insertelement %296, %286[%1 : i32] : vector<32xf16>
%298 = llvm.shufflevector %297, %286 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%299 = llvm.extractvalue %276[1] : !llvm.array<8 x vector<32xf16>>
%300 = llvm.intr.fmuladd(%298, %281, %299) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%301 = llvm.select %221, %300, %299 : vector<32xi1>, vector<32xf16>
%302 = llvm.insertvalue %301, %292[1] : !llvm.array<8 x vector<32xf16>>
// Row 2.
%303 = llvm.mul %239, %17 : i64
%304 = llvm.add %303, %275 : i64
%305 = llvm.getelementptr %25[%304] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%306 = llvm.load %305 : !llvm.ptr -> f16
%307 = llvm.insertelement %306, %286[%1 : i32] : vector<32xf16>
%308 = llvm.shufflevector %307, %286 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%309 = llvm.extractvalue %276[2] : !llvm.array<8 x vector<32xf16>>
%310 = llvm.intr.fmuladd(%308, %281, %309) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%311 = llvm.select %221, %310, %309 : vector<32xi1>, vector<32xf16>
%312 = llvm.insertvalue %311, %302[2] : !llvm.array<8 x vector<32xf16>>
// Row 3.
%313 = llvm.mul %245, %17 : i64
%314 = llvm.add %313, %275 : i64
%315 = llvm.getelementptr %25[%314] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%316 = llvm.load %315 : !llvm.ptr -> f16
%317 = llvm.insertelement %316, %286[%1 : i32] : vector<32xf16>
%318 = llvm.shufflevector %317, %286 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%319 = llvm.extractvalue %276[3] : !llvm.array<8 x vector<32xf16>>
%320 = llvm.intr.fmuladd(%318, %281, %319) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%321 = llvm.select %221, %320, %319 : vector<32xi1>, vector<32xf16>
%322 = llvm.insertvalue %321, %312[3] : !llvm.array<8 x vector<32xf16>>
// Row 4.
%323 = llvm.mul %251, %17 : i64
%324 = llvm.add %323, %275 : i64
%325 = llvm.getelementptr %25[%324] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%326 = llvm.load %325 : !llvm.ptr -> f16
%327 = llvm.insertelement %326, %286[%1 : i32] : vector<32xf16>
%328 = llvm.shufflevector %327, %286 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%329 = llvm.extractvalue %276[4] : !llvm.array<8 x vector<32xf16>>
%330 = llvm.intr.fmuladd(%328, %281, %329) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%331 = llvm.select %221, %330, %329 : vector<32xi1>, vector<32xf16>
%332 = llvm.insertvalue %331, %322[4] : !llvm.array<8 x vector<32xf16>>
// Row 5.
%333 = llvm.mul %257, %17 : i64
%334 = llvm.add %333, %275 : i64
%335 = llvm.getelementptr %25[%334] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%336 = llvm.load %335 : !llvm.ptr -> f16
%337 = llvm.insertelement %336, %286[%1 : i32] : vector<32xf16>
%338 = llvm.shufflevector %337, %286 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%339 = llvm.extractvalue %276[5] : !llvm.array<8 x vector<32xf16>>
%340 = llvm.intr.fmuladd(%338, %281, %339) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%341 = llvm.select %221, %340, %339 : vector<32xi1>, vector<32xf16>
%342 = llvm.insertvalue %341, %332[5] : !llvm.array<8 x vector<32xf16>>
// Row 6.
%343 = llvm.mul %263, %17 : i64
%344 = llvm.add %343, %275 : i64
%345 = llvm.getelementptr %25[%344] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%346 = llvm.load %345 : !llvm.ptr -> f16
%347 = llvm.insertelement %346, %286[%1 : i32] : vector<32xf16>
%348 = llvm.shufflevector %347, %286 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%349 = llvm.extractvalue %276[6] : !llvm.array<8 x vector<32xf16>>
%350 = llvm.intr.fmuladd(%348, %281, %349) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%351 = llvm.select %221, %350, %349 : vector<32xi1>, vector<32xf16>
%352 = llvm.insertvalue %351, %342[6] : !llvm.array<8 x vector<32xf16>>
// Row 7.
%353 = llvm.mul %269, %17 : i64
%354 = llvm.add %353, %275 : i64
%355 = llvm.getelementptr %25[%354] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%356 = llvm.load %355 : !llvm.ptr -> f16
%357 = llvm.insertelement %356, %286[%1 : i32] : vector<32xf16>
%358 = llvm.shufflevector %357, %286 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%359 = llvm.extractvalue %276[7] : !llvm.array<8 x vector<32xf16>>
%360 = llvm.intr.fmuladd(%358, %281, %359) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%361 = llvm.select %221, %360, %359 : vector<32xi1>, vector<32xf16>
%362 = llvm.insertvalue %361, %352[7] : !llvm.array<8 x vector<32xf16>>
// Advance the reduction index by 1 and loop with the updated tile.
%363 = llvm.add %275, %18 : i64
llvm.br ^bb11(%363, %362 : i64, !llvm.array<8 x vector<32xf16>>)
// Masked write-back: store each of the eight accumulator rows through the row
// pointers computed in ^bb10 (%229, %236, %242, %248, %254, %260, %266, %272),
// writing only the lanes enabled in %221. Then advance the tail column index
// %214 by 32 and return to the tail loop header.
^bb13: // pred: ^bb11
%364 = llvm.extractvalue %276[0] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %364, %229, %221 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%365 = llvm.extractvalue %276[1] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %365, %236, %221 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%366 = llvm.extractvalue %276[2] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %366, %242, %221 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%367 = llvm.extractvalue %276[3] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %367, %248, %221 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%368 = llvm.extractvalue %276[4] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %368, %254, %221 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%369 = llvm.extractvalue %276[5] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %369, %260, %221 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%370 = llvm.extractvalue %276[6] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %370, %266, %221 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%371 = llvm.extractvalue %276[7] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %371, %272, %221 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%372 = llvm.add %214, %19 : i64
llvm.br ^bb9(%372 : i64)
// Row-loop latch: advance the row index %65 by 8 and return to ^bb3.
^bb14: // pred: ^bb9
%373 = llvm.add %65, %20 : i64
llvm.br ^bb3(%373 : i64)
// Single-row path, reached once the 8-row loop at ^bb3 has covered rows 0..39.
// Column loop header: %374 advances by 32 (see ^bb19) while %374 < %57.
// Exit goes to the outer-loop latch at ^bb20.
^bb15(%374: i64): // 2 preds: ^bb3, ^bb19
%375 = llvm.icmp "slt" %374, %57 : i64
llvm.cond_br %375, ^bb16, ^bb20
^bb16: // pred: ^bb15
// Lane mask %383: lane i is active when min(%57 - %374, 32) > i. All column
// steps on this path are masked; there is no separate unmasked loop here.
%376 = llvm.mul %374, %9 : i64
%377 = llvm.add %57, %376 : i64
%378 = llvm.icmp "sgt" %377, %19 : i64
%379 = llvm.select %378, %19, %377 : i1, i64
%380 = llvm.mlir.undef : vector<32xi64>
%381 = llvm.insertelement %379, %380[%1 : i32] : vector<32xi64>
%382 = llvm.shufflevector %381, %380 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xi64>
%383 = llvm.icmp "sgt" %382, %13 : vector<32xi64>
// Index arithmetic: row %385 = 41 * %47 + 40, column %388 = 128 * %45 + %52 + %374,
// element offset %390 = %385 * 789 + %388.
%384 = llvm.mul %47, %8 : i64
%385 = llvm.add %384, %16 : i64
%386 = llvm.mul %45, %21 : i64
%387 = llvm.add %386, %52 : i64
%388 = llvm.add %387, %374 : i64
%389 = llvm.mul %385, %11 : i64
%390 = llvm.add %389, %388 : i64
// Masked load of the single row from %39 into slot 0; %391 is kept for the
// masked store in ^bb19.
%391 = llvm.getelementptr %39[%390] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%392 = llvm.intr.masked.load %391, %383, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
%393 = llvm.insertvalue %392, %15[0] : !llvm.array<8 x vector<32xf16>>
// Slots 1..7 of the 8-row array are filled with the zero vector %14; only
// slot 0 is ever updated or stored on this path.
%394 = llvm.insertvalue %14, %393[1] : !llvm.array<8 x vector<32xf16>>
%395 = llvm.insertvalue %14, %394[2] : !llvm.array<8 x vector<32xf16>>
%396 = llvm.insertvalue %14, %395[3] : !llvm.array<8 x vector<32xf16>>
%397 = llvm.insertvalue %14, %396[4] : !llvm.array<8 x vector<32xf16>>
%398 = llvm.insertvalue %14, %397[5] : !llvm.array<8 x vector<32xf16>>
%399 = llvm.insertvalue %14, %398[6] : !llvm.array<8 x vector<32xf16>>
%400 = llvm.insertvalue %14, %399[7] : !llvm.array<8 x vector<32xf16>>
// Start the single-row reduction loop at index 0.
llvm.br ^bb17(%22, %400 : i64, !llvm.array<8 x vector<32xf16>>)
// Single-row reduction loop: %401 runs 0..455 in steps of 1 and %402 carries the
// 8-slot array, of which only slot 0 is live. Exit goes to ^bb19.
^bb17(%401: i64, %402: !llvm.array<8 x vector<32xf16>>): // 2 preds: ^bb16, ^bb18
%403 = llvm.icmp "slt" %401, %17 : i64
llvm.cond_br %403, ^bb18, ^bb19
^bb18: // pred: ^bb17
// Load the scalar %25[%385 * 456 + %401] as a vector<1xf16>; its single lane is
// extracted below (%412).
%404 = llvm.mul %385, %17 : i64
%405 = llvm.add %404, %401 : i64
%406 = llvm.getelementptr %25[%405] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%407 = llvm.load %406 {alignment = 2 : i64} : !llvm.ptr -> vector<1xf16>
// Masked load of 32 lanes from %32 at element offset %401 * 789 + %388.
%408 = llvm.mul %401, %11 : i64
%409 = llvm.add %408, %388 : i64
%410 = llvm.getelementptr %32[%409] : (!llvm.ptr, i64) -> !llvm.ptr, f16
%411 = llvm.intr.masked.load %410, %383, %14 {alignment = 2 : i32} : (!llvm.ptr, vector<32xi1>, vector<32xf16>) -> vector<32xf16>
// Splat the scalar, fused multiply-add into slot 0, and keep the old value in
// lanes disabled by %383.
%412 = llvm.extractelement %407[%0 : i64] : vector<1xf16>
%413 = llvm.mlir.undef : vector<32xf16>
%414 = llvm.insertelement %412, %413[%1 : i32] : vector<32xf16>
%415 = llvm.shufflevector %414, %413 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<32xf16>
%416 = llvm.extractvalue %402[0] : !llvm.array<8 x vector<32xf16>>
%417 = llvm.intr.fmuladd(%415, %411, %416) : (vector<32xf16>, vector<32xf16>, vector<32xf16>) -> vector<32xf16>
%418 = llvm.select %383, %417, %416 : vector<32xi1>, vector<32xf16>
%419 = llvm.insertvalue %418, %15[0] : !llvm.array<8 x vector<32xf16>>
// Slots 1..7 are copied through unchanged from the incoming array.
%420 = llvm.extractvalue %402[1] : !llvm.array<8 x vector<32xf16>>
%421 = llvm.insertvalue %420, %419[1] : !llvm.array<8 x vector<32xf16>>
%422 = llvm.extractvalue %402[2] : !llvm.array<8 x vector<32xf16>>
%423 = llvm.insertvalue %422, %421[2] : !llvm.array<8 x vector<32xf16>>
%424 = llvm.extractvalue %402[3] : !llvm.array<8 x vector<32xf16>>
%425 = llvm.insertvalue %424, %423[3] : !llvm.array<8 x vector<32xf16>>
%426 = llvm.extractvalue %402[4] : !llvm.array<8 x vector<32xf16>>
%427 = llvm.insertvalue %426, %425[4] : !llvm.array<8 x vector<32xf16>>
%428 = llvm.extractvalue %402[5] : !llvm.array<8 x vector<32xf16>>
%429 = llvm.insertvalue %428, %427[5] : !llvm.array<8 x vector<32xf16>>
%430 = llvm.extractvalue %402[6] : !llvm.array<8 x vector<32xf16>>
%431 = llvm.insertvalue %430, %429[6] : !llvm.array<8 x vector<32xf16>>
%432 = llvm.extractvalue %402[7] : !llvm.array<8 x vector<32xf16>>
%433 = llvm.insertvalue %432, %431[7] : !llvm.array<8 x vector<32xf16>>
// Advance the reduction index by 1 and loop.
%434 = llvm.add %401, %18 : i64
llvm.br ^bb17(%434, %433 : i64, !llvm.array<8 x vector<32xf16>>)
// Single-row write-back: masked store of slot 0 through %391 (from ^bb16) using
// mask %383, then advance the column index %374 by 32.
^bb19: // pred: ^bb17
%435 = llvm.extractvalue %402[0] : !llvm.array<8 x vector<32xf16>>
llvm.intr.masked.store %435, %391, %383 {alignment = 2 : i32} : vector<32xf16>, vector<32xi1> into !llvm.ptr
%436 = llvm.add %374, %19 : i64
llvm.br ^bb15(%436 : i64)
// Outer-loop latch: advance %52 by 128 and return to ^bb1.
^bb20: // pred: ^bb15
%437 = llvm.add %52, %21 : i64
llvm.br ^bb1(%437 : i64)
// Function exit: return the i32 constant 0 (%1).
^bb21: // pred: ^bb1
llvm.return %1 : i32
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment