NO Canonicalize | YES CSE
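Pass-by-pass IR dump of the matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32 dispatch as it is lowered for the AMD AIE target (amdaie-xclbin-fb, target_device = "npu1_4col"); each section shows the IR immediately before the named pass runs (the format MLIR emits under --mlir-print-ir-before-all).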
This file has been truncated.
// -----// IR Dump Before TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- // | |
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>) { | |
hal.executable.export public @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump Before TypePropagationPass (iree-codegen-type-propagation) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
} | |
// -----// IR Dump Before AMDAIELoweringStrategy (iree-amdaie-lowering-strategy) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
} | |
// -----// IR Dump Before LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // | |
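// Relative to the previous dump, AMDAIELoweringStrategy has attached translation_info = <Custom> to the function and
// a lowering_config (tile sizes [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]) plus an #amdaie.packing_config to the
// linalg.matmul; these attributes drive the tiling and packing rewrites in the dumps that follow.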
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
} | |
// -----// IR Dump Before AMDAIELowerExecutableTarget (iree-amdaie-lower-executable-target) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- // | |
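// Relative to the previous dump, AMDAIETileAndFuse has tiled to the level-0 tile sizes [64, 64]: the computation now
// sits in an scf.forall over (128, 128) with step (64, 64) mapped to #gpu.block<y>/<x>, with linalg.fill and
// linalg.matmul fused onto 64x64 slices. The original untiled fill (%6) and matmul (%7) remain as dead ops for cleanup.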
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<128x256xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32> | |
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%9 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xbf16>, tensor<256x64xbf16>) outs(%9 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
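// Relative to the previous dump, AMDAIECleanup has erased the dead untiled linalg.fill and linalg.matmul; only the
// tiled scf.forall feeds the flow.dispatch.tensor.store.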
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xbf16>, tensor<256x64xbf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xbf16>, tensor<256x64xbf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x256xbf16>, tensor<256x64xbf16>) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIEPropagateDataLayout (iree-amdaie-propagate-data-layout) //----- // | |
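// Relative to the previous dump, AMDAIEPackAndTranspose has applied the first packing level (packedSizes = [32, 32, 32]):
// the 64x256, 256x64 and 64x64 tiles are packed via tensor.pack into 2x8x32x32, 8x2x32x32 and 2x2x32x32 tensors, and the
// linalg.matmul is rewritten as a 6-D linalg.generic whose body extends the bf16 operands to f32 (arith.extf) before mulf/addf.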
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
%8 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%9 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%10 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%11 = tensor.empty() : tensor<2x2x32x32xf32> | |
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<64x64xf32> -> tensor<2x2x32x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xbf16>, tensor<8x2x32x32xbf16>) outs(%pack_3 : tensor<2x2x32x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_4: bf16, %out: f32): | |
%13 = arith.extf %in : bf16 to f32 | |
%14 = arith.extf %in_4 : bf16 to f32 | |
%15 = arith.mulf %13, %14 : f32 | |
%16 = arith.addf %out, %15 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x2x32x32xf32> | |
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> | |
%8 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%9 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%10 = tensor.empty() : tensor<2x2x32x32xf32> | |
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x64xf32> -> tensor<2x2x32x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xbf16>, tensor<8x2x32x32xbf16>) outs(%pack_3 : tensor<2x2x32x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_4: bf16, %out: f32): | |
%12 = arith.extf %in : bf16 to f32 | |
%13 = arith.extf %in_4 : bf16 to f32 | |
%14 = arith.mulf %12, %13 : f32 | |
%15 = arith.addf %out, %14 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x2x32x32xf32> | |
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
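// Relative to the previous dump, the linalg.fill now initializes the packed 2x2x32x32 accumulator directly, the separate
// 64x64 fill and its tensor.pack have been folded away, and the tensor.unpack writes straight into %extracted_slice_1.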
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%9 = tensor.empty() : tensor<2x2x32x32xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x2x32x32xf32>) -> tensor<2x2x32x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xbf16>, tensor<8x2x32x32xbf16>) outs(%10 : tensor<2x2x32x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_3: bf16, %out: f32): | |
%12 = arith.extf %in : bf16 to f32 | |
%13 = arith.extf %in_3 : bf16 to f32 | |
%14 = arith.mulf %12, %13 : f32 | |
%15 = arith.addf %out, %14 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x2x32x32xf32> | |
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- // | |
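// CSE left this function unchanged; the IR below is identical to the previous dump.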
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%9 = tensor.empty() : tensor<2x2x32x32xf32> | |
%10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x2x32x32xf32>) -> tensor<2x2x32x32xf32> | |
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xbf16>, tensor<8x2x32x32xbf16>) outs(%10 : tensor<2x2x32x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_3: bf16, %out: f32): | |
%12 = arith.extf %in : bf16 to f32 | |
%13 = arith.extf %in_3 : bf16 to f32 | |
%14 = arith.mulf %12, %13 : f32 | |
%15 = arith.addf %out, %14 : f32 | |
linalg.yield %15 : f32 | |
} -> tensor<2x2x32x32xf32> | |
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
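The dump above shows the first level of tiling and data movement: the 128x128x256 matmul is split into 64x64 output blocks by the scf.forall, and each 64x256 / 256x64 operand slice is packed into 32x32 tiles (tensor<2x8x32x32xbf16> and tensor<8x2x32x32xbf16>). As an illustration only (a NumPy stand-in using f32 in place of bf16, not part of the pipeline), the A-side tensor.pack with inner_dims_pos = [0, 1] and inner_tiles = [32, 32] amounts to the following reshape and transpose:

import numpy as np

# Stand-in for the 64x256 A slice (%extracted_slice above); f32 instead of bf16.
a_slice = np.arange(64 * 256, dtype=np.float32).reshape(64, 256)

# tensor.pack inner_dims_pos = [0, 1], inner_tiles = [32, 32]:
# 64x256 -> a 2x8 grid of 32x32 tiles, i.e. shape (2, 8, 32, 32).
a_packed = a_slice.reshape(2, 32, 8, 32).transpose(0, 2, 1, 3)

# Tile (i, j) of the packed tensor is the 32x32 block at rows 32*i, cols 32*j.
assert np.array_equal(a_packed[1, 3], a_slice[32:64, 96:128])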
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%9 = tensor.empty() : tensor<2x2x32x32xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x2x32x32xf32>) -> tensor<2x2x32x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xbf16>, tensor<8x2x32x32xbf16>) outs(%11 : tensor<2x2x32x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_3: bf16, %out: f32): | |
%13 = arith.extf %in : bf16 to f32 | |
%14 = arith.extf %in_3 : bf16 to f32 | |
%15 = arith.mulf %13, %14 : f32 | |
%16 = arith.addf %out, %15 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x2x32x32xf32> | |
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
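Relative to the previous dump, the 2x2x32x32 f32 accumulator is now backed by an explicit memref.alloc tagged with memory space 1 and handed back to the tensor world through bufferization.to_tensor ... restrict writable, with a matching memref.dealloc at the end of the forall body. Memory space 1 appears to correspond to the shared memory-tile (L2) level and space 2, seen in later dumps, to core-local (L1) memory; that reading is an inference from the target, not something stated in the dump. A quick footprint check, plain arithmetic only:

elems = 2 * 2 * 32 * 32   # elements of memref<2x2x32x32xf32, 1 : i32>
size_bytes = elems * 4    # f32 is 4 bytes
print(elems, size_bytes)  # 4096 elements, 16384 bytes = 16 KiB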
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%9 = tensor.empty() : tensor<2x2x32x32xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x2x32x32xf32>) -> tensor<2x2x32x32xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<2x8x32x32xbf16>, tensor<8x2x32x32xbf16>) outs(%11 : tensor<2x2x32x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_3: bf16, %out: f32): | |
%13 = arith.extf %in : bf16 to f32 | |
%14 = arith.extf %in_3 : bf16 to f32 | |
%15 = arith.mulf %13, %14 : f32 | |
%16 = arith.addf %out, %15 : f32 | |
linalg.yield %16 : f32 | |
} -> tensor<2x2x32x32xf32> | |
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIEPropagateDataLayout (iree-amdaie-propagate-data-layout) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%9 = tensor.empty() : tensor<2x2x32x32xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%10 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%11 = linalg.fill ins(%cst : f32) outs(%10 : tensor<2x2x32x32xf32>) -> tensor<2x2x32x32xf32> | |
%12 = tensor.empty() : tensor<2x8x8x4x4x8xbf16> | |
%13 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%14 = tensor.empty() : tensor<8x2x4x8x4x8xbf16> | |
%15 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%16 = tensor.empty() : tensor<2x2x8x8x4x4xf32> | |
%17 = tensor.empty() : tensor<2x2x8x8x4x4xf32> | |
%pack_5 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %17 : tensor<2x2x32x32xf32> -> tensor<2x2x8x8x4x4xf32> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xbf16>, tensor<8x2x8x4x8x4xbf16>) outs(%pack_5 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_7: bf16, %out: f32): | |
%19 = arith.extf %in : bf16 to f32 | |
%20 = arith.extf %in_7 : bf16 to f32 | |
%21 = arith.mulf %19, %20 : f32 | |
%22 = arith.addf %out, %21 : f32 | |
linalg.yield %22 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
%unpack = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
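This dump adds the second level of packing: each 32x32 tile of A is re-tiled into 4x8 sub-tiles (2x8x32x32 -> 2x8x4x8x4x8), B into 8x4 sub-tiles (8x2x32x32 -> 8x2x8x4x8x4), and the f32 accumulator into 4x4 sub-tiles (2x2x32x32 -> 2x2x8x8x4x4), shapes presumably chosen to line up with the AIE vector-unit tile sizes. A NumPy sketch of the A-side pack, again an f32 stand-in for illustration rather than anything the pipeline runs:

import numpy as np

# Stand-in for %pack above: the A block already packed once as (2, 8, 32, 32).
a_l1 = np.arange(2 * 8 * 32 * 32, dtype=np.float32).reshape(2, 8, 32, 32)

# tensor.pack outer_dims_perm = [0, 1, 3, 2], inner_dims_pos = [2, 3],
# inner_tiles = [4, 8]: split each 32x32 tile into an 8x4 grid of 4x8
# sub-tiles, then swap the two new outer dims -> shape (2, 8, 4, 8, 4, 8).
a_l2 = a_l1.reshape(2, 8, 8, 4, 4, 8).transpose(0, 1, 4, 2, 3, 5)

# a_l2[m, k, j, i] is the 4x8 sub-tile at rows 4*i, cols 8*j of tile (m, k).
assert np.array_equal(a_l2[0, 0, 1, 2], a_l1[0, 0, 8:12, 8:16])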
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x2x32x32xf32>) -> tensor<2x2x32x32xf32> | |
%11 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%12 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%13 = tensor.empty() : tensor<2x2x8x8x4x4xf32> | |
%pack_5 = tensor.pack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %13 : tensor<2x2x32x32xf32> -> tensor<2x2x8x8x4x4xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xbf16>, tensor<8x2x8x4x8x4xbf16>) outs(%pack_5 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_7: bf16, %out: f32): | |
%15 = arith.extf %in : bf16 to f32 | |
%16 = arith.extf %in_7 : bf16 to f32 | |
%17 = arith.mulf %15, %16 : f32 | |
%18 = arith.addf %out, %17 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %10 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
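The accumulator travels through a matching pack/unpack pair in this dump: %pack_5 lays the filled 2x2x32x32 f32 tile out as 2x2x8x8x4x4 for the compute, and %unpack reverses exactly that layout before %unpack_6 restores the 64x64 output slice. A small NumPy check, purely illustrative, that the two layout changes are inverses of each other:

import numpy as np

acc = np.random.rand(2, 2, 32, 32).astype(np.float32)

# pack: inner_dims_pos = [2, 3], inner_tiles = [4, 4], outer_dims_perm = [0, 1, 3, 2].
packed = acc.reshape(2, 2, 8, 4, 8, 4).transpose(0, 1, 4, 2, 3, 5)   # (2, 2, 8, 8, 4, 4)

# unpack with the same attributes undoes the permutation and reshape.
unpacked = packed.transpose(0, 1, 3, 4, 2, 5).reshape(2, 2, 32, 32)
assert np.array_equal(unpacked, acc)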
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%12 = tensor.empty() : tensor<2x2x8x8x4x4xf32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xbf16>, tensor<8x2x8x4x8x4xbf16>) outs(%13 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_6: bf16, %out: f32): | |
%15 = arith.extf %in : bf16 to f32 | |
%16 = arith.extf %in_6 : bf16 to f32 | |
%17 = arith.mulf %15, %16 : f32 | |
%18 = arith.addf %out, %17 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_5 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_5 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
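Compared with the previous dump, the canonicalizer has folded the pack of the filled accumulator away: a linalg.fill with a splat value is layout-agnostic, so packing its result is equivalent to filling a tensor.empty of the packed shape directly, which is what %13 now does. A minimal NumPy illustration of why that rewrite preserves values (it relies on the fill being a splat):

import numpy as np

cst = 0.0
# Fill then pack ...
filled = np.full((2, 2, 32, 32), cst, dtype=np.float32)
packed_after_fill = filled.reshape(2, 2, 8, 4, 8, 4).transpose(0, 1, 4, 2, 3, 5)
# ... gives the same values as filling the packed shape directly.
filled_packed = np.full((2, 2, 8, 8, 4, 4), cst, dtype=np.float32)
assert np.array_equal(packed_after_fill, filled_packed)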
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%12 = tensor.empty() : tensor<2x2x8x8x4x4xf32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xbf16>, tensor<8x2x8x4x8x4xbf16>) outs(%13 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_6: bf16, %out: f32): | |
%15 = arith.extf %in : bf16 to f32 | |
%16 = arith.extf %in_6 : bf16 to f32 | |
%17 = arith.mulf %15, %16 : f32 | |
%18 = arith.addf %out, %17 : f32 | |
linalg.yield %18 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_5 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_5 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
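The lowering_config carried on the generic, tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]], encodes the tiling levels: [64, 64] is the scf.forall block tiling already visible above, [0, 0, 1] asks the upcoming AMDAIETileAndFuse to tile the packed reduction dimension (8 outer K tiles of 32) with step 1, which is where the scf.for over 8 iterations in the later dumps comes from, and the third level appears to address the outer M/N dimensions. That reading is inferred from the attribute together with the loops that show up afterwards. The trip-count arithmetic, for reference:

K, k_pack = 256, 32          # reduction extent and level-1 pack size
outer_k_tiles = K // k_pack  # 8 packed K tiles
reduction_tile = 1           # the [0, 0, 1] level tiles them with step 1
print(outer_k_tiles // reduction_tile)  # 8, matching scf.for %arg3 = %c0 to %c8 step %c1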
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%12 = tensor.empty() : tensor<2x2x8x8x4x4xf32> | |
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xbf16>, tensor<8x2x8x4x8x4xbf16>) outs(%14 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_7: bf16, %out: f32): | |
%16 = arith.extf %in : bf16 to f32 | |
%17 = arith.extf %in_7 : bf16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
%unpack = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
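A second allocation now backs the accumulator used by the compute: %alloc_5 is a memref<2x2x8x8x4x4xf32, 2 : i32>, while the earlier %alloc in memory space 1 keeps the 32x32-tile layout. Both hold the same 4096 f32 accumulator values, just in different layouts; a trivial check:

from math import prod

space1_shape = (2, 2, 32, 32)       # memref<2x2x32x32xf32, 1 : i32>
space2_shape = (2, 2, 8, 8, 4, 4)   # memref<2x2x8x8x4x4xf32, 2 : i32>
assert prod(space1_shape) == prod(space2_shape) == 4096   # 16 KiB of f32 each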
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%12 = tensor.empty() : tensor<2x2x8x8x4x4xf32> | |
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_3, %pack_4 : tensor<2x8x4x8x4x8xbf16>, tensor<8x2x8x4x8x4xbf16>) outs(%14 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_8: bf16, %out: f32): | |
%17 = arith.extf %in : bf16 to f32 | |
%18 = arith.extf %in_8 : bf16 to f32 | |
%19 = arith.mulf %17, %18 : f32 | |
%20 = arith.addf %out, %19 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
%c0_6 = arith.constant 0 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%16 = scf.for %arg3 = %c0_6 to %c8 step %c1 iter_args(%arg4 = %14) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_8 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_9 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%extracted_slice_10 = tensor.extract_slice %arg4[0, 0, 0, 0, 0, 0] [2, 2, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<2x2x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_8, %extracted_slice_9 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%extracted_slice_10 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_11: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_11 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
%inserted_slice = tensor.insert_slice %17 into %arg4[0, 0, 0, 0, 0, 0] [2, 2, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
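AMDAIETileAndFuse has split the packed reduction dimension into the scf.for over 8 steps: iteration %arg3 takes the %arg3-th K tile of each operand (slices of shape 2x1x4x8x4x8 and 1x2x8x4x8x4) and accumulates into the accumulator carried through iter_args, while the original untiled generic (%15) is left behind as dead code for the following AMDAIECleanup to remove. A NumPy sketch of the same accumulation structure, written at the simpler level-1 (32x32-tile) granularity rather than the full 6-D layout, and only meant as an illustration:

import numpy as np

M, N, K, T = 64, 64, 256, 32                 # one 64x64 output block, 32x32 tiles
A = np.random.rand(M, K).astype(np.float32)  # f32 stand-in for bf16
B = np.random.rand(K, N).astype(np.float32)

# Level-1 packed views: A as (2, 8, 32, 32), B as (8, 2, 32, 32).
Ap = A.reshape(M // T, T, K // T, T).transpose(0, 2, 1, 3)
Bp = B.reshape(K // T, T, N // T, T).transpose(0, 2, 1, 3)

acc = np.zeros((M // T, N // T, T, T), dtype=np.float32)   # 2x2 grid of 32x32 blocks
for k in range(K // T):                                    # the scf.for over 8 K tiles
    for i in range(M // T):
        for j in range(N // T):
            acc[i, j] += Ap[i, k] @ Bp[k, j]

assert np.allclose(acc.transpose(0, 2, 1, 3).reshape(M, N), A @ B)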
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%arg4 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_9: bf16, %out: f32): | |
%16 = arith.extf %in : bf16 to f32 | |
%17 = arith.extf %in_9 : bf16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
scf.yield %15 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%arg4 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_9: bf16, %out: f32): | |
%16 = arith.extf %in : bf16 to f32 | |
%17 = arith.extf %in_9 : bf16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
scf.yield %15 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
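// NOTE: the 9-D linalg.generic above is the packed matmul kernel. From its indexing maps and
// iterator_types, d2, d5 and d8 are the reduction dimensions, and the body computes, element-wise,
//   C[d0, d1, d4, d3, d6, d7] += extf(A[d0, d2, d5, d3, d6, d8]) * extf(B[d2, d1, d4, d5, d8, d7])
// i.e. a bf16 x bf16 -> f32 matmul expressed on the micro-tiled (4x8 / 8x4 / 4x4) layout, with the
// surrounding scf.for feeding one 32-wide K tile per iteration.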
// -----// IR Dump Before AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- // | |
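// NOTE: this dump matches the previous one; the pass in between made no visible change. The output is
// already tiled by the outer scf.forall into a 2x2 grid of 64x64 blocks (128 / 64 = 2 per dimension),
// mapped to #gpu.block<y>/<x>, and the scf.for walks the 8 K tiles (256 / 32 = 8, hence %c8). The
// tensor.pack ops still sit outside that scf.for; the pass named here appears to be the one that sinks
// them into the loop (compare the next dump). As a rough sketch, the first-level pack
// (inner_dims_pos = [0, 1], inner_tiles = [32, 32], no outer permutation) maps elements as
//   packed[i, j, ii, jj] = src[i * 32 + ii, j * 32 + jj]   // 64x256xbf16 -> 2x8x32x32xbf16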
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_7 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_8 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_7, %extracted_slice_8 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%arg4 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_9: bf16, %out: f32): | |
%16 = arith.extf %in : bf16 to f32 | |
%17 = arith.extf %in_9 : bf16 to f32 | |
%18 = arith.mulf %16, %17 : f32 | |
%19 = arith.addf %out, %18 : f32 | |
linalg.yield %19 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
scf.yield %15 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
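// NOTE: compared with the dump above, AMDAIEFusePackIntoLoop has materialized per-iteration pack chains
// inside the scf.for: each K step now re-packs a 64x32 slice of A (%pack_9 -> %pack_12) and a 32x64
// slice of B (%pack_16 -> %pack_19) into the micro-tiled layouts consumed by the linalg.generic, using
// affine_map<(d0) -> (d0 * 32)> to turn the loop index into a K offset (0, 32, ..., 224). The
// loop-invariant pack chains above the scf.for are now dead (their only uses are unused extract_slice
// ops inside the loop) and disappear in the next dump.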
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<64x256xbf16> -> tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x64xbf16> -> tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%pack_3 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x8x32x32xbf16> -> tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%pack_4 = tensor.pack %pack_2 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<8x2x32x32xbf16> -> tensor<8x2x8x4x8x4xbf16> | |
%alloc_5 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%12 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xbf16> to tensor<2x1x32x32xbf16> | |
%pack_9 = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_8 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_10 = tensor.extract_slice %pack[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xbf16> to tensor<2x1x32x32xbf16> | |
%extracted_slice_11 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%pack_12 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_13 = tensor.extract_slice %pack_3[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_14 = tensor.extract_slice %extracted_slice_0[%16, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%extracted_slice_15 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xbf16> to tensor<1x2x32x32xbf16> | |
%pack_16 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_2[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xbf16> to tensor<1x2x32x32xbf16> | |
%extracted_slice_18 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%pack_19 = tensor.pack %pack_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_18 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%extracted_slice_20 = tensor.extract_slice %pack_4[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_19 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%arg4 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_21: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_21 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_6 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_5 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_6 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
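// NOTE: the canonicalizer run in between removed the dead loop-invariant pack chains and their unused
// slices; only the tensor.empty destinations (%7, %8, %10, %11) survive and are sliced per iteration.
// The two identical affine.apply ops computing (d0 * 32) are still duplicated in each iteration at this
// point; they are merged by the CSE run below (compare the next dump).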
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xbf16> to tensor<2x1x32x32xbf16> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_6 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%pack_7 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%16 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_0[%16, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%extracted_slice_9 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xbf16> to tensor<1x2x32x32xbf16> | |
%pack_10 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_9 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%extracted_slice_11 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%pack_12 = tensor.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_7, %pack_12 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%arg4 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_13: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_13 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- // | |
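// NOTE: CSE folded the duplicated affine.apply, so a single (d0 * 32) offset (%15) now feeds both the A
// and B slices in each iteration; the rest of the structure is unchanged. The pass named here appears to
// give the in-loop pack destinations explicit allocations, judging by the memref.alloc ops that show up
// inside the loop in the next dump.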
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xbf16> to tensor<2x1x32x32xbf16> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_5 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_6 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%pack_7 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_6 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_0[%15, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%extracted_slice_9 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xbf16> to tensor<1x2x32x32xbf16> | |
%pack_10 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_9 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%extracted_slice_11 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%pack_12 = tensor.pack %pack_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_11 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_7, %pack_12 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%arg4 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_13: bf16, %out: f32): | |
%17 = arith.extf %in : bf16 to f32 | |
%18 = arith.extf %in_13 : bf16 to f32 | |
%19 = arith.mulf %17, %18 : f32 | |
%20 = arith.addf %out, %19 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- // | |
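// NOTE: after bufferize-to-allocation, the first-level pack destinations inside the loop are no longer
// slices of tensor.empty: each iteration allocates memref<2x1x32x32xbf16, 1 : i32> and
// memref<1x2x32x32xbf16, 1 : i32> buffers, exposes them via bufferization.to_tensor ... restrict
// writable, and deallocates them before scf.yield. The second-level (micro-tile) packs still write into
// tensor.empty slices at this point.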
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xbf16> to tensor<2x1x32x32xbf16> | |
%alloc_6 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%16 = bufferization.to_tensor %alloc_6 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_7 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%pack_8 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_7 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_0[%15, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%extracted_slice_10 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xbf16> to tensor<1x2x32x32xbf16> | |
%alloc_11 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%extracted_slice_13 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%pack_14 = tensor.pack %pack_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_14 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%arg4 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_15: bf16, %out: f32): | |
%19 = arith.extf %in : bf16 to f32 | |
%20 = arith.extf %in_15 : bf16 to f32 | |
%21 = arith.mulf %19, %20 : f32 | |
%22 = arith.addf %out, %21 : f32 | |
linalg.yield %22 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
memref.dealloc %alloc_6 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_11 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %18 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- // | |
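// NOTE: AMDAIETileAndFuse wrapped the packed matmul in an inner scf.forall over a 2x2 grid mapped to
// #gpu.thread<y>/<x>: each thread consumes a 1x1x4x8x4x8 slice of A and a 1x1x8x4x8x4 slice of B and
// accumulates into a 1x1x8x8x4x4 slice of the f32 buffer. The original untiled linalg.generic (%18) is
// left behind with no uses; the cleanup pass named here removes it together with the other dead ops
// (compare the next dump).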
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = tensor.empty() : tensor<2x8x32x32xbf16> | |
%8 = tensor.empty() : tensor<8x2x32x32xbf16> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%9 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%10 = tensor.empty() : tensor<2x8x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<8x2x8x4x8x4xbf16> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%13 = linalg.fill ins(%cst : f32) outs(%12 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%14 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%15 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %15] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%extracted_slice_5 = tensor.extract_slice %7[0, %arg3, 0, 0] [2, 1, 32, 32] [1, 1, 1, 1] : tensor<2x8x32x32xbf16> to tensor<2x1x32x32xbf16> | |
%alloc_6 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%16 = bufferization.to_tensor %alloc_6 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_7 = tensor.extract_slice %10[0, %arg3, 0, 0, 0, 0] [2, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x8x4x8x4x8xbf16> to tensor<2x1x4x8x4x8xbf16> | |
%pack_8 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_7 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_0[%15, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%extracted_slice_10 = tensor.extract_slice %8[%arg3, 0, 0, 0] [1, 2, 32, 32] [1, 1, 1, 1] : tensor<8x2x32x32xbf16> to tensor<1x2x32x32xbf16> | |
%alloc_11 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%extracted_slice_13 = tensor.extract_slice %11[%arg3, 0, 0, 0, 0, 0] [1, 2, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<8x2x8x4x8x4xbf16> to tensor<1x2x8x4x8x4xbf16> | |
%pack_14 = tensor.pack %pack_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_14 : tensor<2x1x4x8x4x8xbf16>, tensor<1x2x8x4x8x4xbf16>) outs(%arg4 : tensor<2x2x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_15: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_15 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<2x2x8x8x4x4xf32> | |
%19 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack_8[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_14[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_15, %extracted_slice_16 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_17 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_18: bf16, %out: f32): | |
%21 = arith.extf %in : bf16 to f32 | |
%22 = arith.extf %in_18 : bf16 to f32 | |
%23 = arith.mulf %21, %22 : f32 | |
%24 = arith.addf %out, %23 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
memref.dealloc %alloc_6 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_11 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %19 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %9 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
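// NOTE: cleanup dropped the dead untiled linalg.generic and the large tensor.empty / extract_slice ops
// that only fed it; the second-level pack destinations are now loop-shaped tensor.empty values (%10,
// %11) hoisted above the scf.for, so each iteration packs straight into them. The canonicalizer run
// below appears to make no further changes (the following dump matches this one).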
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = tensor.empty() : tensor<2x1x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<1x2x8x4x8x4xbf16> | |
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%alloc_8 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_13 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_14: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_14 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_8 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
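// NOTE: this dump matches the previous one as far as it is shown; the canonicalizer made no visible
// change to the IR at this stage.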
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = tensor.empty() : tensor<2x1x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<1x2x8x4x8x4xbf16> | |
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%alloc_8 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_13 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_14: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_14 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_8 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = tensor.empty() : tensor<2x1x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<1x2x8x4x8x4xbf16> | |
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%alloc_8 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_11 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_12 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_13 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_11, %extracted_slice_12 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_13 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_14: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_14 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_8 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = tensor.empty() : tensor<2x1x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<1x2x8x4x8x4xbf16> | |
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%pack_6 = tensor.pack %pack outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %10 : tensor<2x1x32x32xbf16> -> tensor<2x1x4x8x4x8xbf16> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%alloc_8 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%15 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_7 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%pack_10 = tensor.pack %pack_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %11 : tensor<1x2x32x32xbf16> -> tensor<1x2x8x4x8x4xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%pack_13 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_12 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_14 = tensor.extract_slice %pack_6[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_15 = tensor.extract_slice %pack_9[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_16 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%pack_17 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_16 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %pack_10[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_20: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_20 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_8 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = tensor.empty() : tensor<2x1x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<1x2x8x4x8x4xbf16> | |
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%alloc_7 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%pack_11 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_10 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_12 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_13 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_11, %pack_14 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_15 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_16: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_16 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_7 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = tensor.empty() : tensor<2x1x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<1x2x8x4x8x4xbf16> | |
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%alloc_7 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%pack_11 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_10 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_12 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_13 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_13 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_11, %pack_14 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_15 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_16: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_16 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_7 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%alloc = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%7 = bufferization.to_tensor %alloc restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = tensor.empty() : tensor<2x1x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<1x2x8x4x8x4xbf16> | |
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_4 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_4 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_6 = tensor.extract_slice %extracted_slice_0[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%alloc_7 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%15 = bufferization.to_tensor %alloc_7 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_8 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_9 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_10 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%17 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_12 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_13 = tensor.extract_slice %pack_8[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_14 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%alloc_15 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%18 = bufferization.to_tensor %alloc_15 restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_16 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_17 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_18: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_18 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_15 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_7 : memref<1x2x32x32xbf16, 1 : i32> | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_3 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_1 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
memref.dealloc %alloc : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_2 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_3 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
return | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = tensor.empty() : tensor<2x1x4x8x4x8xbf16> | |
%11 = tensor.empty() : tensor<1x2x8x4x8x4xbf16> | |
%12 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %10[%arg5, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x8xbf16> to tensor<1x1x4x8x4x8xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_13 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_14 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%extracted_slice_15 = tensor.extract_slice %11[0, %arg6, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x2x8x4x8x4xbf16> to tensor<1x1x8x4x8x4xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_17 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_16 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_17 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_18: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_18 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%11 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %11] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %12 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%11, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%13 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%14 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_12 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_13 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%16 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_14 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_15 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_16: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_16 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %14 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before AMDAIEPeelForLoop (iree-amdaie-peel-for-loop) //----- // | |
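// Annotation: the function below is the IR entering iree-amdaie-peel-for-loop. Judging by
// the next dump, the pass peels the first and last iterations off the eight-iteration
// reduction loop `scf.for %arg3 = %c0 to %c8 step %c1`, leaving three loops over
// [0, 1), [1, 7) and [7, 8) so the later fill- and consumer-fusion passes have a dedicated
// prologue and epilogue iteration to rewrite. The sketch below is a minimal, hypothetical
// illustration of the same peeling on a stand-alone loop; the function name, tensor shape
// and empty loop bodies are invented for illustration only and are not part of this dump.
func.func @peel_for_loop_sketch(%init: tensor<8xf32>) -> tensor<8xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c7 = arith.constant 7 : index
  %c8 = arith.constant 8 : index
  // Original form: a single loop, `scf.for %i = %c0 to %c8 step %c1`.
  // After peeling, the same body runs as a prologue, a main loop and an epilogue:
  %first = scf.for %i = %c0 to %c1 step %c1 iter_args(%acc = %init) -> (tensor<8xf32>) {
    scf.yield %acc : tensor<8xf32>
  }
  %main = scf.for %i = %c1 to %c7 step %c1 iter_args(%acc = %first) -> (tensor<8xf32>) {
    scf.yield %acc : tensor<8xf32>
  }
  %last = scf.for %i = %c7 to %c8 step %c1 iter_args(%acc = %main) -> (tensor<8xf32>) {
    scf.yield %acc : tensor<8xf32>
  }
  return %last : tensor<8xf32>
}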
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%10 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%11 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, %11] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %12 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_5[%11, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%13 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %13 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%14 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_11 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_12 = tensor.pack %extracted_slice_11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_13 = tensor.extract_slice %pack_10[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%16 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_15 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_14 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_15 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_16: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_16 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %14 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_7 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_7 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
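// Annotation: the function below is the peeled IR entering canonicalize. The peeled first
// and last loops each have a trip count of one (lower bound + step == upper bound), so
// canonicalization folds them away: the loop body is inlined and the induction variable is
// replaced by the constant lower bound. In the next dump the prologue therefore reads the
// K-tile at offset 0 and the epilogue at offset 224 (7 * 32) directly, with no surrounding
// scf.for. For example (hypothetical, minimal):
//   %r = scf.for %i = %c7 to %c8 step %c1 iter_args(%a = %x) -> (tensor<8xf32>) { ... }
// folds to its body with %i replaced by %c7 and %a replaced by %x.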
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%c1_7 = arith.constant 1 : index | |
%10 = scf.for %arg3 = %c0 to %c1_7 step %c1 iter_args(%arg4 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_17: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_17 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%c7 = arith.constant 7 : index | |
%11 = scf.for %arg3 = %c1_7 to %c7 step %c1 iter_args(%arg4 = %10) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_17: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_17 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%12 = scf.for %arg3 = %c7 to %c8 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice[0, %13] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_9 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[%13, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_11 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_12 = tensor.extract_slice %pack[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_14 = tensor.extract_slice %pack_11[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_16 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_15 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_17: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_17 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %19 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %16 : tensor<2x2x8x8x4x4xf32> | |
} | |
%unpack = tensor.unpack %12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_8 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before AMDAIEFuseFillIntoForall (iree-amdaie-fuse-fill-into-forall) //----- // | |
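// Annotation: the function below is the IR entering iree-amdaie-fuse-fill-into-forall. The
// pass moves the linalg.fill that zero-initializes the whole 2x2x8x8x4x4 accumulator into
// the first (peeled) scf.forall: in the next dump the forall's shared_outs operand switches
// from the fill result to the bufferization.to_tensor value, and a per-slice linalg.fill on
// a 1x1x8x8x4x4 tile appears inside the loop body, feeding the matmul generic directly. The
// sketch below is a minimal, hypothetical illustration of that rewrite; the function name,
// the 2x2 shape and the absence of a following matmul are invented for illustration only.
func.func @fuse_fill_into_forall_sketch(%out: tensor<2x2xf32>) -> tensor<2x2xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  // Before fusion: %filled = linalg.fill ins(%cst) outs(%out), and the forall iterates on %filled.
  // After fusion, each thread fills only the slice it is about to compute:
  %res = scf.forall (%i, %j) in (2, 2) shared_outs(%o = %out) -> (tensor<2x2xf32>) {
    %slice = tensor.extract_slice %o[%i, %j] [1, 1] [1, 1] : tensor<2x2xf32> to tensor<1x1xf32>
    %filled = linalg.fill ins(%cst : f32) outs(%slice : tensor<1x1xf32>) -> tensor<1x1xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %filled into %o[%i, %j] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<2x2xf32>
    }
  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
  return %res : tensor<2x2xf32>
}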
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %9) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_20: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_20 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%13 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x8x8x4x4xf32>) { | |
%17 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %17] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_17 = tensor.extract_slice %extracted_slice_5[%17, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %19 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_20, %pack_22 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_23 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_24: bf16, %out: f32): | |
%24 = arith.extf %in : bf16 to f32 | |
%25 = arith.extf %in_24 : bf16 to f32 | |
%26 = arith.mulf %24, %25 : f32 | |
%27 = arith.addf %out, %26 : f32 | |
linalg.yield %27 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %20 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_20: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_20 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_14 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_14 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before AMDAIEFuseConsumerIntoLoop (iree-amdaie-fuse-consumer-into-loop) //----- // | |
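// Annotation: the function below is the IR entering iree-amdaie-fuse-consumer-into-loop. At
// this point the two tensor.unpack ops that move the 2x2x8x8x4x4 accumulator back into the
// 64x64 output slice still run only after the last peeled scf.forall. Going by the pass
// name, the expectation is that these consumer ops are fused into that final forall so each
// thread unpacks the tile it produced; this is an assumption based on the pass name rather
// than on a later dump.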
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.fill ins(%cst : f32) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%extracted_slice_20 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%19 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_21: bf16, %out: f32): | |
%21 = arith.extf %in : bf16 to f32 | |
%22 = arith.extf %in_21 : bf16 to f32 | |
%23 = arith.mulf %21, %22 : f32 | |
%24 = arith.addf %out, %23 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%13 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x8x8x4x4xf32>) { | |
%17 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %17] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_17 = tensor.extract_slice %extracted_slice_5[%17, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %19 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_20, %pack_22 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_23 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_24: bf16, %out: f32): | |
%24 = arith.extf %in : bf16 to f32 | |
%25 = arith.extf %in_24 : bf16 to f32 | |
%26 = arith.mulf %24, %25 : f32 | |
%27 = arith.addf %out, %26 : f32 | |
linalg.yield %27 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %20 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_16, %pack_18 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_20: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_20 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %7 : tensor<2x2x8x8x4x4xf32> -> tensor<2x2x32x32xf32> | |
%unpack_14 = tensor.unpack %unpack inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_14 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
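// Annotation: the function below is the IR entering cse. Common-subexpression elimination
// merges identical side-effect-free ops, so duplicates left behind by the fusion passes,
// such as the two identical, back-to-back tensor.extract_slice ops of the accumulator slice
// %arg5[%arg3, %arg4, 0, 0, 0, 0] in the first forall of the preceding dump, collapse to a
// single op.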
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<2x2x8x8x4x4xf32>) -> tensor<2x2x8x8x4x4xf32> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%11 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %11 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%12 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.fill ins(%cst : f32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%19 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_20: bf16, %out: f32): | |
%21 = arith.extf %in : bf16 to f32 | |
%22 = arith.extf %in_20 : bf16 to f32 | |
%23 = arith.mulf %21, %22 : f32 | |
%24 = arith.addf %out, %23 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%13 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %12) -> (tensor<2x2x8x8x4x4xf32>) { | |
%17 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %17] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%18 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %18 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%17, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%19 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %19 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%20 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%21 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %21 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%22 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %22 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_23: bf16, %out: f32): | |
%24 = arith.extf %in : bf16 to f32 | |
%25 = arith.extf %in_23 : bf16 to f32 | |
%26 = arith.mulf %24, %25 : f32 | |
%27 = arith.addf %out, %26 : f32 | |
linalg.yield %27 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %23 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %20 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%14 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%15 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%16:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %13, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%17 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_22: bf16, %out: f32): | |
%20 = arith.extf %in : bf16 to f32 | |
%21 = arith.extf %in_22 : bf16 to f32 | |
%22 = arith.mulf %20, %21 : f32 | |
%23 = arith.addf %out, %22 : f32 | |
linalg.yield %23 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
%inserted_slice = tensor.insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%extracted_slice_20 = tensor.extract_slice %inserted_slice[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%unpack_21 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %19 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %16#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
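// NOTE: the function above is the input to CSE. The most visible redundancy is in the first (peeled) reduction step of each thread-level scf.forall, where %arg5 is sliced twice with identical offsets/sizes/strides: once to seed the linalg.fill and once more as a duplicate that is no longer used:
//
//   %extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32>
//   %19 = linalg.fill ins(%cst : f32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32>
//   %extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32>
//
// In the next dump (before AMDAIEFusePackIntoLoop, i.e. after CSE has run) only one of these extract_slice ops survives, and the redundant bufferization.to_tensor views of %alloc_2 / %alloc_1 in the peeled last iteration are deduplicated as well, so its two tensor.pack ops write into %9 and %10 directly.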
// -----// IR Dump Before AMDAIEFusePackIntoLoop (iree-amdaie-fuse-pack-into-loop) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_19: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_19 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_23: bf16, %out: f32): | |
%21 = arith.extf %in : bf16 to f32 | |
%22 = arith.extf %in_23 : bf16 to f32 | |
%23 = arith.mulf %21, %22 : f32 | |
%24 = arith.addf %out, %23 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_21: bf16, %out: f32): | |
%17 = arith.extf %in : bf16 to f32 | |
%18 = arith.extf %in_21 : bf16 to f32 | |
%19 = arith.mulf %17, %18 : f32 | |
%20 = arith.addf %out, %19 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
%inserted_slice = tensor.insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_20 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
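// NOTE: the dump below (before AMDAIEBufferizeToAllocation) appears unchanged relative to the one above, which suggests iree-amdaie-fuse-pack-into-loop had nothing left to fuse at this point: the tensor.pack ops that feed the inner compute forall are already materialized inside the scf.for reduction loop, e.g.
//
//   %pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16>
//   %pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16>
//
// both sit in the body of "scf.for %arg3 = %c1 to %c7" ahead of the thread-level scf.forall.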
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_19: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_19 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_23: bf16, %out: f32): | |
%21 = arith.extf %in : bf16 to f32 | |
%22 = arith.extf %in_23 : bf16 to f32 | |
%23 = arith.mulf %21, %22 : f32 | |
%24 = arith.addf %out, %23 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_21: bf16, %out: f32): | |
%17 = arith.extf %in : bf16 to f32 | |
%18 = arith.extf %in_21 : bf16 to f32 | |
%19 = arith.mulf %17, %18 : f32 | |
%20 = arith.addf %out, %19 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
%inserted_slice = tensor.insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_20 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
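// NOTE: the dump below (before AMDAIELowerToUKernels) is again unchanged as far as this excerpt shows: AMDAIEBufferizeToAllocation introduces no new buffers here, and the working set stays at the six allocations created earlier. Reading the address spaces as memtile (1 : i32) and core-local (2 : i32) memory, which is an assumption, the buffers pair up as follows:
//
//   %alloc   = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32>  // core-local RHS tile
//   %alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32>  // core-local LHS tile
//   %alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32>    // memtile RHS staging buffer
//   %alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32>    // memtile LHS staging buffer
//   %alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32>   // core-local accumulator
//   %alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32>     // memtile output staging buffer
//
// AMDAIELowerToUKernels would presumably only rewrite the inner linalg.generic into a microkernel call when a matching ukernel is enabled for this target variant.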
// -----// IR Dump Before AMDAIELowerToUKernels (iree-amdaie-lower-to-ukernels) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_19: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_19 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_23: bf16, %out: f32): | |
%21 = arith.extf %in : bf16 to f32 | |
%22 = arith.extf %in_23 : bf16 to f32 | |
%23 = arith.mulf %21, %22 : f32 | |
%24 = arith.addf %out, %23 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_21: bf16, %out: f32): | |
%17 = arith.extf %in : bf16 to f32 | |
%18 = arith.extf %in_21 : bf16 to f32 | |
%19 = arith.mulf %17, %18 : f32 | |
%20 = arith.addf %out, %19 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
%inserted_slice = tensor.insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_20 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- // | |
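// NOTE (annotation, not compiler output): the function below is the IR as it enters AMDAIECleanup.
// As far as this dump shows, the K=256 reduction has been split into a peeled first 32-wide tile,
// a middle scf.for over %c1..%c7, and a peeled last tile at offset 224; each tile is packed to
// 32x32 (memory space 1) and 4x8 / 8x4 (memory space 2) layouts around a 2x2 thread-level scf.forall.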
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_19: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_19 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_23: bf16, %out: f32): | |
%21 = arith.extf %in : bf16 to f32 | |
%22 = arith.extf %in_23 : bf16 to f32 | |
%23 = arith.mulf %21, %22 : f32 | |
%24 = arith.addf %out, %23 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_21: bf16, %out: f32): | |
%17 = arith.extf %in : bf16 to f32 | |
%18 = arith.extf %in_21 : bf16 to f32 | |
%19 = arith.mulf %17, %18 : f32 | |
%20 = arith.addf %out, %19 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_20 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before AMDAIEInsertLoopsForVectorization (iree-amdaie-insert-loops-for-vectorization) //----- // | |
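// NOTE (annotation): the dump below, the input to AMDAIEInsertLoopsForVectorization, appears
// textually identical to the preceding one; AMDAIECleanup made no visible change to this function.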
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_0 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_5 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_6 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_7 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice_5[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_9 = tensor.pack %extracted_slice_8 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_9[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%16 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_19: bf16, %out: f32): | |
%18 = arith.extf %in : bf16 to f32 | |
%19 = arith.extf %in_19 : bf16 to f32 | |
%20 = arith.mulf %18, %19 : f32 | |
%21 = arith.addf %out, %20 : f32 | |
linalg.yield %21 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_14 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_2 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_16 = tensor.extract_slice %extracted_slice_5[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_18 = tensor.extract_slice %pack_15[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_19 = tensor.pack %extracted_slice_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_20 = tensor.extract_slice %pack_17[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_21 = tensor.pack %extracted_slice_20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_19, %pack_21 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_22 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_23: bf16, %out: f32): | |
%21 = arith.extf %in : bf16 to f32 | |
%22 = arith.extf %in_23 : bf16 to f32 | |
%23 = arith.mulf %21, %22 : f32 | |
%24 = arith.addf %out, %23 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_10 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_11 = tensor.pack %extracted_slice_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_13 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_14 = tensor.extract_slice %pack_11[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_0 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_15 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_16 = tensor.extract_slice %pack_13[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_17 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_18 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_15, %pack_17 : tensor<1x1x4x8x4x8xbf16>, tensor<1x1x8x4x8x4xbf16>) outs(%extracted_slice_18 : tensor<1x1x8x8x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_21: bf16, %out: f32): | |
%17 = arith.extf %in : bf16 to f32 | |
%18 = arith.extf %in_21 : bf16 to f32 | |
%19 = arith.mulf %17, %18 : f32 | |
%20 = arith.addf %out, %19 : f32 | |
linalg.yield %20 : f32 | |
} -> tensor<1x1x8x8x4x4xf32> | |
%extracted_slice_19 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_20 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_19 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_20 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_6 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_4 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_0 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before AMDAIEVectorization (iree-amdaie-vectorization) //----- // | |
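// NOTE (annotation): in the dump below, AMDAIEInsertLoopsForVectorization has wrapped each
// tile-level linalg.generic in an scf.for nest over the packed outer dimensions (trip counts
// 1, 1, 1, 8, 8, 4), reducing the body to a 1x1x1x1x4x8 * 1x1x1x1x8x4 -> 1x1x1x1x4x4 micro-matmul.
// The loop bounds materialize as many duplicated arith.constant index ops, presumably because no
// canonicalization/folding runs at this point in the pipeline.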
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c1 = arith.constant 1 : index | |
%c1_0 = arith.constant 1 : index | |
%c1_1 = arith.constant 1 : index | |
%c1_2 = arith.constant 1 : index | |
%c1_3 = arith.constant 1 : index | |
%c1_4 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c0_5 = arith.constant 0 : index | |
%c0_6 = arith.constant 0 : index | |
%c1_7 = arith.constant 1 : index | |
%c1_8 = arith.constant 1 : index | |
%c1_9 = arith.constant 1 : index | |
%c1_10 = arith.constant 1 : index | |
%c1_11 = arith.constant 1 : index | |
%c1_12 = arith.constant 1 : index | |
%c0_13 = arith.constant 0 : index | |
%c0_14 = arith.constant 0 : index | |
%c0_15 = arith.constant 0 : index | |
%c1_16 = arith.constant 1 : index | |
%c1_17 = arith.constant 1 : index | |
%c1_18 = arith.constant 1 : index | |
%c1_19 = arith.constant 1 : index | |
%c1_20 = arith.constant 1 : index | |
%c1_21 = arith.constant 1 : index | |
%c0_22 = arith.constant 0 : index | |
%c0_23 = arith.constant 0 : index | |
%c0_24 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%c0_25 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1_26 = arith.constant 1 : index | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_27 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_28 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_29 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_30 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_31 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0_25) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0_25) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0_25) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_32 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_33 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_31 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_30 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_34 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_29 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_34 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_35 = tensor.extract_slice %extracted_slice_32[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_28 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_36 = tensor.pack %extracted_slice_35 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_41 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_42 = tensor.pack %extracted_slice_41 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_43 = tensor.extract_slice %pack_36[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_45 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst : f32) outs(%extracted_slice_45 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%c0_46 = arith.constant 0 : index | |
%c1_47 = arith.constant 1 : index | |
%c1_48 = arith.constant 1 : index | |
%c0_49 = arith.constant 0 : index | |
%c1_50 = arith.constant 1 : index | |
%c1_51 = arith.constant 1 : index | |
%c0_52 = arith.constant 0 : index | |
%c1_53 = arith.constant 1 : index | |
%c1_54 = arith.constant 1 : index | |
%c0_55 = arith.constant 0 : index | |
%c8 = arith.constant 8 : index | |
%c1_56 = arith.constant 1 : index | |
%c0_57 = arith.constant 0 : index | |
%c8_58 = arith.constant 8 : index | |
%c1_59 = arith.constant 1 : index | |
%c0_60 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c1_61 = arith.constant 1 : index | |
%17 = scf.for %arg6 = %c0_46 to %c1_47 step %c1_48 iter_args(%arg7 = %16) -> (tensor<1x1x8x8x4x4xf32>) { | |
%18 = scf.for %arg8 = %c0_49 to %c1_50 step %c1_51 iter_args(%arg9 = %arg7) -> (tensor<1x1x8x8x4x4xf32>) { | |
%19 = scf.for %arg10 = %c0_52 to %c1_53 step %c1_54 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xf32>) { | |
%20 = scf.for %arg12 = %c0_55 to %c8 step %c1_56 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg14 = %c0_57 to %c8_58 step %c1_59 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xf32>) { | |
%22 = scf.for %arg16 = %c0_60 to %c4 step %c1_61 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_62 = tensor.extract_slice %pack_42[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_63 = tensor.extract_slice %pack_44[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_64 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_62, %extracted_slice_63 : tensor<1x1x1x1x4x8xbf16>, tensor<1x1x1x1x8x4xbf16>) outs(%extracted_slice_64 : tensor<1x1x1x1x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_65: bf16, %out: f32): | |
%24 = arith.extf %in : bf16 to f32 | |
%25 = arith.extf %in_65 : bf16 to f32 | |
%26 = arith.mulf %24, %25 : f32 | |
%27 = arith.addf %out, %26 : f32 | |
linalg.yield %27 : f32 | |
} -> tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %23 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %22 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %20 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %19 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %18 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%12 = scf.for %arg3 = %c1_26 to %c7 step %c1_26 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_41 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_29 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_42 = tensor.pack %extracted_slice_41 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_43 = tensor.extract_slice %extracted_slice_32[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_28 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_45 = tensor.extract_slice %pack_42[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_46 = tensor.pack %extracted_slice_45 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_47 = tensor.extract_slice %pack_44[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_48 = tensor.pack %extracted_slice_47 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_49 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%c0_50 = arith.constant 0 : index | |
%c1_51 = arith.constant 1 : index | |
%c1_52 = arith.constant 1 : index | |
%c0_53 = arith.constant 0 : index | |
%c1_54 = arith.constant 1 : index | |
%c1_55 = arith.constant 1 : index | |
%c0_56 = arith.constant 0 : index | |
%c1_57 = arith.constant 1 : index | |
%c1_58 = arith.constant 1 : index | |
%c0_59 = arith.constant 0 : index | |
%c8 = arith.constant 8 : index | |
%c1_60 = arith.constant 1 : index | |
%c0_61 = arith.constant 0 : index | |
%c8_62 = arith.constant 8 : index | |
%c1_63 = arith.constant 1 : index | |
%c0_64 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c1_65 = arith.constant 1 : index | |
%20 = scf.for %arg8 = %c0_50 to %c1_51 step %c1_52 iter_args(%arg9 = %extracted_slice_49) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg10 = %c0_53 to %c1_54 step %c1_55 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xf32>) { | |
%22 = scf.for %arg12 = %c0_56 to %c1_57 step %c1_58 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xf32>) { | |
%23 = scf.for %arg14 = %c0_59 to %c8 step %c1_60 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xf32>) { | |
%24 = scf.for %arg16 = %c0_61 to %c8_62 step %c1_63 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xf32>) { | |
%25 = scf.for %arg18 = %c0_64 to %c4 step %c1_65 iter_args(%arg19 = %arg17) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_66 = tensor.extract_slice %pack_46[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_67 = tensor.extract_slice %pack_48[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_68 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_66, %extracted_slice_67 : tensor<1x1x1x1x4x8xbf16>, tensor<1x1x1x1x8x4xbf16>) outs(%extracted_slice_68 : tensor<1x1x1x1x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_69: bf16, %out: f32): | |
%27 = arith.extf %in : bf16 to f32 | |
%28 = arith.extf %in_69 : bf16 to f32 | |
%29 = arith.mulf %27, %28 : f32 | |
%30 = arith.addf %out, %29 : f32 | |
linalg.yield %30 : f32 | |
} -> tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %26 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %25 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %24 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %23 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %22 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_37 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_38 = tensor.pack %extracted_slice_37 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_39 = tensor.extract_slice %extracted_slice_32[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_40 = tensor.pack %extracted_slice_39 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_41 = tensor.extract_slice %pack_38[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_27 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_42 = tensor.pack %extracted_slice_41 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_43 = tensor.extract_slice %pack_40[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_44 = tensor.pack %extracted_slice_43 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_45 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%c0_46 = arith.constant 0 : index | |
%c1_47 = arith.constant 1 : index | |
%c1_48 = arith.constant 1 : index | |
%c0_49 = arith.constant 0 : index | |
%c1_50 = arith.constant 1 : index | |
%c1_51 = arith.constant 1 : index | |
%c0_52 = arith.constant 0 : index | |
%c1_53 = arith.constant 1 : index | |
%c1_54 = arith.constant 1 : index | |
%c0_55 = arith.constant 0 : index | |
%c8 = arith.constant 8 : index | |
%c1_56 = arith.constant 1 : index | |
%c0_57 = arith.constant 0 : index | |
%c8_58 = arith.constant 8 : index | |
%c1_59 = arith.constant 1 : index | |
%c0_60 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c1_61 = arith.constant 1 : index | |
%16 = scf.for %arg7 = %c0_46 to %c1_47 step %c1_48 iter_args(%arg8 = %extracted_slice_45) -> (tensor<1x1x8x8x4x4xf32>) { | |
%17 = scf.for %arg9 = %c0_49 to %c1_50 step %c1_51 iter_args(%arg10 = %arg8) -> (tensor<1x1x8x8x4x4xf32>) { | |
%18 = scf.for %arg11 = %c0_52 to %c1_53 step %c1_54 iter_args(%arg12 = %arg10) -> (tensor<1x1x8x8x4x4xf32>) { | |
%19 = scf.for %arg13 = %c0_55 to %c8 step %c1_56 iter_args(%arg14 = %arg12) -> (tensor<1x1x8x8x4x4xf32>) { | |
%20 = scf.for %arg15 = %c0_57 to %c8_58 step %c1_59 iter_args(%arg16 = %arg14) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg17 = %c0_60 to %c4 step %c1_61 iter_args(%arg18 = %arg16) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_64 = tensor.extract_slice %pack_42[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_65 = tensor.extract_slice %pack_44[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_66 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_64, %extracted_slice_65 : tensor<1x1x1x1x4x8xbf16>, tensor<1x1x1x1x8x4xbf16>) outs(%extracted_slice_66 : tensor<1x1x1x1x4x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 1], [1, 1, 0, 0, 0, 0]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 32], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} { | |
^bb0(%in: bf16, %in_67: bf16, %out: f32): | |
%23 = arith.extf %in : bf16 to f32 | |
%24 = arith.extf %in_67 : bf16 to f32 | |
%25 = arith.mulf %23, %24 : f32 | |
%26 = arith.addf %out, %25 : f32 | |
linalg.yield %26 : f32 | |
} -> tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %22 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %20 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %19 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %18 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %17 : tensor<1x1x8x8x4x4xf32> | |
} | |
%extracted_slice_62 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_63 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_62 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_63 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_33 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_31 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_30 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_29 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_28 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_27 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // | |
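// NOTE (annotation): in the dump below, the input to EliminateEmptyTensorsPass, the duplicated
// index constants from the previous dump have been collapsed to a single %c0/%c1/%c4/%c7/%c8 each
// (presumably by CSE), a bf16 zero constant has been introduced, and the unit-trip scf.for nests
// around the micro-matmul remain in place.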
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = tensor.empty() : tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_6 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_7 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_6[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
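// [note] The reduction dimension (K = 256, tiled by 32) is split into a peeled first tile handled by the scf.forall just below, a main scf.for over tiles 1..6, and a peeled last tile (offset 224) whose accumulator unpack is fused into the scf.forall that computes it.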
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_10[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%17 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %16) -> (tensor<1x1x8x8x4x4xf32>) { | |
%18 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x8x8x4x4xf32>) { | |
%19 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xf32>) { | |
%20 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xf32>) { | |
%22 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xf32>) { | |
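// [note] Innermost micro-kernel: read a 4x8 bf16 tile of A and an 8x4 bf16 tile of B, extend both to f32, and accumulate into a 4x4 f32 tile via vector.contract.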
%extracted_slice_20 = tensor.extract_slice %pack_16[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_21 = tensor.extract_slice %pack_18[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%23 = vector.transfer_read %extracted_slice_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%24 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%25 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%26 = arith.extf %23 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%27 = arith.extf %24 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %26, %27, %25 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%29 = vector.transfer_write %28, %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %29 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %22 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %20 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %19 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %18 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
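// [note] Main K loop: each iteration packs the next 64x32 slice of A and 32x64 slice of B (offset 32 * %arg3) into the memory-space-1 buffers, repacks per-core tiles, and accumulates into the 2x2x8x8x4x4 result carried as %arg4.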
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_17 = tensor.extract_slice %extracted_slice_6[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_23) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xf32>) { | |
%22 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xf32>) { | |
%23 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xf32>) { | |
%24 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xf32>) { | |
%25 = scf.for %arg18 = %c0 to %c4 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_24 = tensor.extract_slice %pack_20[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_25 = tensor.extract_slice %pack_22[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_26 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%26 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%27 = vector.transfer_read %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%28 = vector.transfer_read %extracted_slice_26[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%29 = arith.extf %26 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%30 = arith.extf %27 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %29, %30, %28 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%32 = vector.transfer_write %31, %extracted_slice_26[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %32 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %25 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %24 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %23 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %22 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
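// [note] Peeled last K tile (offset 224): the final accumulation is fused with the unpack of the 1x1x8x8x4x4 result into the 2x2x32x32 memory-space-1 buffer inside the same thread-mapped scf.forall, before the block-level unpack writes the 64x64 tile back to the 128x128 output.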
%extracted_slice_11 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_12 = tensor.pack %extracted_slice_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack_12[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_14[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1x8x8x4x4xf32>) { | |
%17 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x8x8x4x4xf32>) { | |
%18 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x8x8x4x4xf32>) { | |
%19 = scf.for %arg13 = %c0 to %c8 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x8x8x4x4xf32>) { | |
%20 = scf.for %arg15 = %c0 to %c8 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg17 = %c0 to %c4 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_22 = tensor.extract_slice %pack_16[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_23 = tensor.extract_slice %pack_18[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_24 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%22 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%23 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%24 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%25 = arith.extf %22 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%26 = arith.extf %23 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%27 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %25, %26, %24 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%28 = vector.transfer_write %27, %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %28 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %20 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %19 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %18 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %17 : tensor<1x1x8x8x4x4xf32> | |
} | |
%extracted_slice_20 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_21 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_20 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_7 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
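// [note] empty-tensor-to-alloc-tensor converts any `tensor.empty` ops that survive elimination into `bufferization.alloc_tensor`; none remain in this function, so the IR below appears unchanged in the following dump.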
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> -> tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_6 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_7 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_6[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_10[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%17 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %16) -> (tensor<1x1x8x8x4x4xf32>) { | |
%18 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x8x8x4x4xf32>) { | |
%19 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xf32>) { | |
%20 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xf32>) { | |
%22 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_20 = tensor.extract_slice %pack_16[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_21 = tensor.extract_slice %pack_18[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%23 = vector.transfer_read %extracted_slice_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%24 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%25 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%26 = arith.extf %23 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%27 = arith.extf %24 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %26, %27, %25 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%29 = vector.transfer_write %28, %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %29 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %22 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %20 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %19 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %18 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_17 = tensor.extract_slice %extracted_slice_6[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_23) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xf32>) { | |
%22 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xf32>) { | |
%23 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xf32>) { | |
%24 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xf32>) { | |
%25 = scf.for %arg18 = %c0 to %c4 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_24 = tensor.extract_slice %pack_20[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_25 = tensor.extract_slice %pack_22[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_26 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%26 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%27 = vector.transfer_read %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%28 = vector.transfer_read %extracted_slice_26[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%29 = arith.extf %26 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%30 = arith.extf %27 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %29, %30, %28 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%32 = vector.transfer_write %31, %extracted_slice_26[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %32 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %25 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %24 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %23 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %22 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_11 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_12 = tensor.pack %extracted_slice_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack_12[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_14[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1x8x8x4x4xf32>) { | |
%17 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x8x8x4x4xf32>) { | |
%18 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x8x8x4x4xf32>) { | |
%19 = scf.for %arg13 = %c0 to %c8 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x8x8x4x4xf32>) { | |
%20 = scf.for %arg15 = %c0 to %c8 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg17 = %c0 to %c4 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_22 = tensor.extract_slice %pack_16[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_23 = tensor.extract_slice %pack_18[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_24 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%22 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%23 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%24 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%25 = arith.extf %22 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%26 = arith.extf %23 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%27 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %25, %26, %24 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%28 = vector.transfer_write %27, %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %28 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %20 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %19 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %18 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %17 : tensor<1x1x8x8x4x4xf32> | |
} | |
%extracted_slice_20 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_21 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_20 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_7 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // | |
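// [note] iree-codegen-iree-comprehensive-bufferize runs one-shot bufferization over the function, lowering the tensor-level pack/unpack, slice, and vector transfer ops to operations on memrefs; the `bufferization.to_tensor ... restrict writable` ops below exist so that bufferization reuses the pre-allocated memory-space-1 and memory-space-2 buffers instead of inserting new allocations.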
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16> | |
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> -> tensor<128x128xf32> | |
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<128x128xf32>) { | |
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 256] [1, 1] : tensor<128x256xbf16> to tensor<64x256xbf16> | |
%extracted_slice_6 = tensor.extract_slice %4[0, %arg1] [256, 64] [1, 1] : tensor<256x128xbf16> to tensor<256x64xbf16> | |
%extracted_slice_7 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32> | |
%7 = bufferization.to_tensor %alloc_5 restrict writable : memref<2x2x32x32xf32, 1 : i32> | |
%8 = bufferization.to_tensor %alloc_4 restrict writable : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%extracted_slice_8 = tensor.extract_slice %extracted_slice[0, 0] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%9 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack = tensor.pack %extracted_slice_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_9 = tensor.extract_slice %extracted_slice_6[0, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%10 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_10 = tensor.pack %extracted_slice_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%11 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %8) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_10[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = linalg.fill ins(%cst_0 : f32) outs(%extracted_slice_19 : tensor<1x1x8x8x4x4xf32>) -> tensor<1x1x8x8x4x4xf32> | |
%17 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %16) -> (tensor<1x1x8x8x4x4xf32>) { | |
%18 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (tensor<1x1x8x8x4x4xf32>) { | |
%19 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xf32>) { | |
%20 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xf32>) { | |
%22 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_20 = tensor.extract_slice %pack_16[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_21 = tensor.extract_slice %pack_18[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_22 = tensor.extract_slice %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%23 = vector.transfer_read %extracted_slice_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%24 = vector.transfer_read %extracted_slice_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%25 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%26 = arith.extf %23 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%27 = arith.extf %24 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%28 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %26, %27, %25 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%29 = vector.transfer_write %28, %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %29 into %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %22 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %20 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %19 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %18 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %17 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%12 = scf.for %arg3 = %c1 to %c7 step %c1 iter_args(%arg4 = %11) -> (tensor<2x2x8x8x4x4xf32>) { | |
%14 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3) | |
%extracted_slice_15 = tensor.extract_slice %extracted_slice[0, %14] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%15 = bufferization.to_tensor %alloc_3 restrict writable : memref<2x1x32x32xbf16, 1 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %15 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_17 = tensor.extract_slice %extracted_slice_6[%14, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%16 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x2x32x32xbf16, 1 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %16 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%17 = scf.forall (%arg5, %arg6) in (2, 2) shared_outs(%arg7 = %arg4) -> (tensor<2x2x8x8x4x4xf32>) { | |
%extracted_slice_19 = tensor.extract_slice %pack_16[%arg5, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%18 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_20 = tensor.pack %extracted_slice_19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %18 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_21 = tensor.extract_slice %pack_18[0, %arg6, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%19 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_22 = tensor.pack %extracted_slice_21 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %19 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_23 = tensor.extract_slice %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%20 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %extracted_slice_23) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (tensor<1x1x8x8x4x4xf32>) { | |
%22 = scf.for %arg12 = %c0 to %c1 step %c1 iter_args(%arg13 = %arg11) -> (tensor<1x1x8x8x4x4xf32>) { | |
%23 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (tensor<1x1x8x8x4x4xf32>) { | |
%24 = scf.for %arg16 = %c0 to %c8 step %c1 iter_args(%arg17 = %arg15) -> (tensor<1x1x8x8x4x4xf32>) { | |
%25 = scf.for %arg18 = %c0 to %c4 step %c1 iter_args(%arg19 = %arg17) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_24 = tensor.extract_slice %pack_20[%arg8, %arg12, %arg18, %arg14, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_25 = tensor.extract_slice %pack_22[%arg12, %arg10, %arg16, %arg18, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_26 = tensor.extract_slice %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%26 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%27 = vector.transfer_read %extracted_slice_25[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%28 = vector.transfer_read %extracted_slice_26[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%29 = arith.extf %26 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%30 = arith.extf %27 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %29, %30, %28 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%32 = vector.transfer_write %31, %extracted_slice_26[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %32 into %arg19[%arg8, %arg10, %arg16, %arg14, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %25 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %24 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %23 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %22 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %20 into %arg7[%arg5, %arg6, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %17 : tensor<2x2x8x8x4x4xf32> | |
} | |
%extracted_slice_11 = tensor.extract_slice %extracted_slice[0, 224] [64, 32] [1, 1] : tensor<64x256xbf16> to tensor<64x32xbf16> | |
%pack_12 = tensor.pack %extracted_slice_11 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %9 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> | |
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[224, 0] [32, 64] [1, 1] : tensor<256x64xbf16> to tensor<32x64xbf16> | |
%pack_14 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %10 : tensor<32x64xbf16> -> tensor<1x2x32x32xbf16> | |
%13:2 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %12, %arg6 = %7) -> (tensor<2x2x8x8x4x4xf32>, tensor<2x2x32x32xf32>) { | |
%extracted_slice_15 = tensor.extract_slice %pack_12[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x1x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%14 = bufferization.to_tensor %alloc_1 restrict writable : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%pack_16 = tensor.pack %extracted_slice_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %14 : tensor<1x1x32x32xbf16> -> tensor<1x1x4x8x4x8xbf16> | |
%extracted_slice_17 = tensor.extract_slice %pack_14[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x2x32x32xbf16> to tensor<1x1x32x32xbf16> | |
%15 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%pack_18 = tensor.pack %extracted_slice_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x32x32xbf16> -> tensor<1x1x8x4x8x4xbf16> | |
%extracted_slice_19 = tensor.extract_slice %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<2x2x8x8x4x4xf32> to tensor<1x1x8x8x4x4xf32> | |
%16 = scf.for %arg7 = %c0 to %c1 step %c1 iter_args(%arg8 = %extracted_slice_19) -> (tensor<1x1x8x8x4x4xf32>) { | |
%17 = scf.for %arg9 = %c0 to %c1 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x8x8x4x4xf32>) { | |
%18 = scf.for %arg11 = %c0 to %c1 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x1x8x8x4x4xf32>) { | |
%19 = scf.for %arg13 = %c0 to %c8 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x1x8x8x4x4xf32>) { | |
%20 = scf.for %arg15 = %c0 to %c8 step %c1 iter_args(%arg16 = %arg14) -> (tensor<1x1x8x8x4x4xf32>) { | |
%21 = scf.for %arg17 = %c0 to %c4 step %c1 iter_args(%arg18 = %arg16) -> (tensor<1x1x8x8x4x4xf32>) { | |
%extracted_slice_22 = tensor.extract_slice %pack_16[%arg7, %arg11, %arg17, %arg13, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x4x8x4x8xbf16> to tensor<1x1x1x1x4x8xbf16> | |
%extracted_slice_23 = tensor.extract_slice %pack_18[%arg11, %arg9, %arg15, %arg17, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x8x4xbf16> to tensor<1x1x1x1x8x4xbf16> | |
%extracted_slice_24 = tensor.extract_slice %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> to tensor<1x1x1x1x4x4xf32> | |
%22 = vector.transfer_read %extracted_slice_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x8xbf16>, vector<1x1x1x1x4x8xbf16> | |
%23 = vector.transfer_read %extracted_slice_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x8x4xbf16>, vector<1x1x1x1x8x4xbf16> | |
%24 = vector.transfer_read %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : tensor<1x1x1x1x4x4xf32>, vector<1x1x1x1x4x4xf32> | |
%25 = arith.extf %22 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%26 = arith.extf %23 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%27 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %25, %26, %24 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
%28 = vector.transfer_write %27, %extracted_slice_24[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, tensor<1x1x1x1x4x4xf32> | |
%inserted_slice = tensor.insert_slice %28 into %arg18[%arg7, %arg9, %arg15, %arg13, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x4x4xf32> into tensor<1x1x8x8x4x4xf32> | |
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %21 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %20 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %19 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %18 : tensor<1x1x8x8x4x4xf32> | |
} | |
scf.yield %17 : tensor<1x1x8x8x4x4xf32> | |
} | |
%extracted_slice_20 = tensor.extract_slice %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<2x2x32x32xf32> to tensor<1x1x32x32xf32> | |
%unpack_21 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_20 : tensor<1x1x8x8x4x4xf32> -> tensor<1x1x32x32xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack_21 into %arg6[%arg3, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xf32> into tensor<2x2x32x32xf32> | |
tensor.parallel_insert_slice %16 into %arg5[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xf32> into tensor<2x2x8x8x4x4xf32> | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%unpack = tensor.unpack %13#1 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %extracted_slice_7 : tensor<2x2x32x32xf32> -> tensor<64x64xf32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32> | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xf32>> | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
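The dump above ends the tensor-level form of the dispatch: the 128x128x256 bf16 matmul is tiled into 64x64 output blocks, each block is packed into 32x32 tiles, and those tiles are re-packed into 4x8 (LHS) and 8x4 (RHS) micro-tiles so that the innermost vector.contract multiplies a 4x8 bf16 slice by an 8x4 bf16 slice into a 4x4 f32 accumulator. The NumPy sketch below is only an illustrative model of that inner loop nest over one 32x32 tile pair (unit leading dims dropped); the function name and the use of f32 inputs, standing in for bf16 values after arith.extf, are assumptions rather than anything taken from the IR.

import numpy as np

def micro_matmul_tile(a_pack, b_pack, acc_pack):
    # a_pack:   (4, 8, 4, 8)  packed LHS tile    [k_outer, m_outer, m_in=4, k_in=8]
    # b_pack:   (8, 4, 8, 4)  packed RHS tile    [n_outer, k_outer, k_in=8, n_in=4]
    # acc_pack: (8, 8, 4, 4)  packed accumulator [n_outer, m_outer, m_in=4, n_in=4]
    # Mirrors the scf.for nest above: m_outer and n_outer each run over 8 tiles,
    # the k_outer reduction runs over 4 tiles, and every step is a 4x8 @ 8x4 GEMM.
    for m in range(8):
        for n in range(8):
            for k in range(4):
                a = a_pack[k, m].astype(np.float32)   # arith.extf on the 4x8 slice
                b = b_pack[n, k].astype(np.float32)   # arith.extf on the 8x4 slice
                acc_pack[n, m] += a @ b               # the vector.contract step
    return acc_pack

Calling this once per thread and per K tile, with a zero-initialized acc_pack for the first tile, reproduces one 32x32 block of the accumulator.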
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
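// The workgroup-level scf.forall below walks the 128x128 output in 64x64 blocks (a 2x2 grid mapped to #gpu.block<y>/<x>); each block packs its operand slices into the memory-space-1 staging buffers before the per-tile compute in memory space 2.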
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_9 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_3 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_9 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_9 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_10 = memref.subview %subview_6[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_11 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_2 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc_11 : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_11 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_15 = memref.subview %alloc_9[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_11[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_0 : f32) outs(%subview_17 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_17) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%7 = scf.for %arg10 = %c0 to %c8 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%9 = scf.for %arg14 = %c0 to %c4 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%subview_19 = memref.subview %alloc_1[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%10 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%11 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%12 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %15, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_21 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_22 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
scf.yield %arg15 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %9 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %8 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %7 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %6 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %5 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
%subview_18 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%3 = scf.for %arg2 = %c1 to %c7 step %c1 iter_args(%arg3 = %alloc_4) -> (memref<2x2x8x8x4x4xf32, 2 : i32>) { | |
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_15 = memref.subview %subview[0, %4] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_16 = memref.subview %subview_6[%4, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg4, %arg5) in (2, 2) { | |
%subview_17 = memref.subview %alloc_3[%arg4, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_2[0, %arg5, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_19 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %subview_19) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%7 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%9 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%10 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%subview_21 = memref.subview %alloc_1[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %alloc[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_23 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%11 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%12 = vector.transfer_read %subview_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%13 = vector.transfer_read %subview_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%14 = arith.extf %11 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%15 = arith.extf %12 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %13 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %16, %subview_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_24 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_23 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_24 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
scf.yield %arg17 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %10 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %9 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %8 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %7 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %6 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
%subview_20 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%5 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %arg3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
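// The scf.for above covers K tiles 1..6 (%arg2 in [1, 7), offset %arg2 * 32); K tile 0 was peeled into the prologue (which also performs the linalg.fill), and the last K tile is handled below at offset 224 = 7 * 32.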
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_6[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_15 = memref.subview %alloc_3[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_2[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_17 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_17) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%7 = scf.for %arg10 = %c0 to %c8 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%9 = scf.for %arg14 = %c0 to %c4 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%subview_21 = memref.subview %alloc_1[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_23 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%10 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%11 = vector.transfer_read %subview_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%12 = vector.transfer_read %subview_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %15, %subview_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_24 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_23 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_24 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
scf.yield %arg15 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %9 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %8 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %7 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %6 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %5 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
%subview_18 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_18 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
%subview_19 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) outs(%subview_19 : memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
%subview_20 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_7 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_14 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_7 : memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_14 : memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
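// The identity linalg.generic below reads %2 and writes it back to %2; it appears to be the bufferized remnant of the final flow.dispatch.tensor.store, whose source and destination already alias after bufferization.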
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
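This bufferized form replaces tensor.pack/unpack with iree_linalg_ext.pack/unpack on memrefs, materializes the staging buffers with explicit memory-space attributes (1 : i32 for the 32x32 tile buffers, 2 : i32 for the micro-tiled buffers next to the compute), and inserts linalg.generic copy regions wherever bufferization had to relay data between buffers. The NumPy sketch below models the two packing levels applied to the LHS as plain reshape/transpose steps; the function names are illustrative only, and full (unpadded) tiles are assumed, as holds for this dispatch.

import numpy as np

def pack_to_space1(a_slice):
    # iree_linalg_ext.pack ... inner_dims_pos = [0, 1] inner_tiles = [32, 32]:
    # (64, 32) -> (2, 1, 32, 32), with [M, K, m, k] = a_slice[M*32 + m, K*32 + k].
    return a_slice.reshape(2, 32, 1, 32).transpose(0, 2, 1, 3)

def pack_lhs_to_space2(block):
    # iree_linalg_ext.pack ... outer_dims_perm = [0, 1, 3, 2]
    # inner_dims_pos = [2, 3] inner_tiles = [4, 8]:
    # (1, 1, 32, 32) -> (1, 1, 4, 8, 4, 8); the trailing 4x8 is the micro-tile
    # read by each vector.transfer_read in the inner loop nest.
    split = block.reshape(1, 1, 8, 4, 4, 8)     # 32 -> (8, 4) rows, 32 -> (4, 8) cols
    return split.transpose(0, 1, 4, 2, 3, 5)    # apply outer_dims_perm [0, 1, 3, 2]

# Quick check of the first level against the pack semantics:
a = np.arange(64 * 32, dtype=np.float32).reshape(64, 32)
assert pack_to_space1(a)[1, 0, 5, 7] == a[1 * 32 + 5, 7]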
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_9 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_3 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_9 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_9 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_10 = memref.subview %subview_6[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_11 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_2 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc_11 : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_11 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_15 = memref.subview %alloc_9[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_11[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_0 : f32) outs(%subview_17 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_17) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%7 = scf.for %arg10 = %c0 to %c8 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%9 = scf.for %arg14 = %c0 to %c4 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%subview_19 = memref.subview %alloc_1[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%10 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%11 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%12 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %15, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_21 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_22 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
scf.yield %arg15 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %9 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %8 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %7 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %6 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %5 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
%subview_18 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%3 = scf.for %arg2 = %c1 to %c7 step %c1 iter_args(%arg3 = %alloc_4) -> (memref<2x2x8x8x4x4xf32, 2 : i32>) { | |
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_15 = memref.subview %subview[0, %4] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_16 = memref.subview %subview_6[%4, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg4, %arg5) in (2, 2) { | |
%subview_17 = memref.subview %alloc_3[%arg4, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_2[0, %arg5, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_19 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %subview_19) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%7 = scf.for %arg10 = %c0 to %c1 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%9 = scf.for %arg14 = %c0 to %c8 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%10 = scf.for %arg16 = %c0 to %c4 step %c1 iter_args(%arg17 = %arg15) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%subview_21 = memref.subview %alloc_1[%arg6, %arg10, %arg16, %arg12, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %alloc[%arg10, %arg8, %arg14, %arg16, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_23 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%11 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%12 = vector.transfer_read %subview_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%13 = vector.transfer_read %subview_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%14 = arith.extf %11 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%15 = arith.extf %12 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %13 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %16, %subview_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_24 = memref.subview %arg17[%arg6, %arg8, %arg14, %arg12, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_23 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_24 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
scf.yield %arg17 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %10 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %9 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %8 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %7 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %6 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
%subview_20 = memref.subview %arg3[%arg4, %arg5, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%5 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.yield %arg3 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_6[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_15 = memref.subview %alloc_3[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_2[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_17 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = scf.for %arg4 = %c0 to %c1 step %c1 iter_args(%arg5 = %subview_17) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%5 = scf.for %arg6 = %c0 to %c1 step %c1 iter_args(%arg7 = %arg5) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%6 = scf.for %arg8 = %c0 to %c1 step %c1 iter_args(%arg9 = %arg7) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%7 = scf.for %arg10 = %c0 to %c8 step %c1 iter_args(%arg11 = %arg9) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%8 = scf.for %arg12 = %c0 to %c8 step %c1 iter_args(%arg13 = %arg11) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%9 = scf.for %arg14 = %c0 to %c4 step %c1 iter_args(%arg15 = %arg13) -> (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
%subview_21 = memref.subview %alloc_1[%arg4, %arg8, %arg14, %arg10, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %alloc[%arg8, %arg6, %arg12, %arg14, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_23 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%10 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%11 = vector.transfer_read %subview_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%12 = vector.transfer_read %subview_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %15, %subview_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_24 = memref.subview %arg15[%arg4, %arg6, %arg12, %arg10, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_23 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_24 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
scf.yield %arg15 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %9 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %8 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %7 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %6 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
scf.yield %5 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
%subview_18 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %4 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_18 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
%subview_19 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) outs(%subview_19 : memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
%subview_20 = memref.subview %3[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_7 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_14 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_7 : memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_14 : memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
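// NOTE: CSE folds syntactically identical, side-effect-free ops into one SSA value. In the function below,
// several memref.subview pairs share the same source, offsets, sizes, and strides (e.g. %subview_21/%subview_22
// and %subview_17/%subview_18 inside the first scf.forall (2, 2) region); the next dump shows each such pair
// collapsed to a single subview, so the trailing copy-like linalg.generic ops end up reading and writing the
// same view.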
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
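// NOTE: workgroup-level structure of the function below: an scf.forall over (128, 128) in steps of (64, 64)
// mapped to #gpu.block tiles the 128x128 f32 output into 64x64 blocks; 64x32 and 32x64 bf16 operand slices are
// packed into 2x1x32x32 and 1x2x32x32 buffers in memory space 1 (%alloc_9/%alloc_11 for the first K-step,
// %alloc_3/%alloc_2 thereafter); an inner scf.forall over (2, 2) mapped to #gpu.thread then repacks 32x32 slices
// into the memory-space-2 buffers %alloc_1 (1x1x4x8x4x8) and %alloc (1x1x8x4x8x4) for the vectorized inner kernel.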
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_9 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_3 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_9 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_9 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_10 = memref.subview %subview_6[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_11 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_2 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc_11 : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_11 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_15 = memref.subview %alloc_9[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_11[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_0 : f32) outs(%subview_17 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
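// NOTE: the 8 x 8 x 4 loop nest below is the packed micro-kernel: each iteration reads a 4x8 tile of the
// packed LHS and an 8x4 tile of the packed RHS, widens both from bf16 to f32 with arith.extf, and accumulates
// into a 4x4 f32 tile of %subview_17 via vector.contract.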
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_19 = memref.subview %alloc_1[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %subview_17[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %subview_17[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_21 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_22 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} | |
} | |
} | |
%subview_18 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_17 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_15 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_15 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_16 = memref.subview %subview_6[%3, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_17 = memref.subview %alloc_3[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_2[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_19 = memref.subview %alloc_4[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%subview_21 = memref.subview %alloc_1[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_23 = memref.subview %subview_19[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%5 = vector.transfer_read %subview_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%6 = vector.transfer_read %subview_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%8 = arith.extf %5 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %6 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %9, %subview_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_24 = memref.subview %subview_19[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_23 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_24 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} | |
} | |
} | |
%subview_20 = memref.subview %alloc_4[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_19 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_6[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_15 = memref.subview %alloc_3[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_2[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_21 = memref.subview %alloc_1[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_22 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_23 = memref.subview %subview_17[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_22[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_23[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_23[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_24 = memref.subview %subview_17[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_23 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_24 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} | |
} | |
} | |
%subview_18 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_18 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
%subview_19 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) outs(%subview_19 : memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
%subview_20 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_17 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_7 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
%subview_14 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_7 : memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_14 : memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
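// NOTE: this is the IR after the CSE run above: the duplicate memref.subview pairs are gone, and the
// copy-like linalg.generic ops now take the same subview as both ins and outs (e.g. %subview_19 and
// %subview_16 in the first scf.forall (2, 2) region below).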
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_9 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_3 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_9 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_9 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_10 = memref.subview %subview_6[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_11 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_2 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc_11 : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_11 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_9[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_11[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_0 : f32) outs(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_17 = memref.subview %alloc_1[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_18 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_19[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_19 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_19 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} | |
} | |
} | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_14 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_15 = memref.subview %subview_6[%3, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_16 = memref.subview %alloc_3[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_2[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_4[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%subview_19 = memref.subview %alloc_1[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %subview_18[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%6 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%8 = arith.extf %5 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %6 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %9, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_21 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_21 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} | |
} | |
} | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_18 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_18 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_6[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_3[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_2[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_18 = memref.subview %alloc_1[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_20 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_20 : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} | |
} | |
} | |
%subview_17 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_17 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_17 : memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) outs(%subview_17 : memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) outs(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_7 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_7 : memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_7 : memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) { | |
^bb0(%in: f32, %out: f32): | |
linalg.yield %in : f32 | |
} | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
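// NOTE: in the portion shown here, the function below appears textually identical to the preceding
// "IR Dump Before Canonicalizer" dump, i.e. that canonicalize step left this IR unchanged.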
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_9 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_3 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_9 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_9 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_10 = memref.subview %subview_6[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_11 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_2 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc_11 : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_11 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_9[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_11[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_0 : f32) outs(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_17 = memref.subview %alloc_1[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_18 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_19[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_14 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_15 = memref.subview %subview_6[%3, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_16 = memref.subview %alloc_3[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_2[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_4[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%subview_19 = memref.subview %alloc_1[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %subview_18[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%6 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%8 = arith.extf %5 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %6 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %9, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_6[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_3[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_2[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_18 = memref.subview %alloc_1[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
%subview_17 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_17 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_7 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- // | |
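// NOTE (editorial): the dump below appears to be identical to the previous one, so CleanupBufferAllocViewPass made no visible change to this function. The pass about to run, iree-codegen-hoist-statically-bound-allocations, lifts memref.alloc ops whose sizes are statically known out of loop/region bodies to the function entry block (pairing them with memref.dealloc ops before the return), so a buffer is allocated once rather than once per iteration. A minimal hand-written sketch of that rewrite follows; it is not compiler output, and "test.use" is a placeholder op that only parses with -allow-unregistered-dialect:
//
//   func.func @before(%n: index) {
//     %c0 = arith.constant 0 : index
//     %c1 = arith.constant 1 : index
//     scf.for %i = %c0 to %n step %c1 {
//       %buf = memref.alloc() : memref<32x32xbf16, 1 : i32>  // fresh buffer every iteration
//       "test.use"(%buf) : (memref<32x32xbf16, 1 : i32>) -> ()
//       memref.dealloc %buf : memref<32x32xbf16, 1 : i32>
//     }
//     return
//   }
//
//   func.func @after(%n: index) {
//     %c0 = arith.constant 0 : index
//     %c1 = arith.constant 1 : index
//     %buf = memref.alloc() : memref<32x32xbf16, 1 : i32>    // hoisted: allocated once at entry
//     scf.for %i = %c0 to %n step %c1 {
//       "test.use"(%buf) : (memref<32x32xbf16, 1 : i32>) -> ()
//     }
//     memref.dealloc %buf : memref<32x32xbf16, 1 : i32>
//     return
//   }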
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
%alloc = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_2 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_3 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_4 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_5 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_6 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_9 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_3 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_9 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_9 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_10 = memref.subview %subview_6[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc_11 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_2 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc_11 : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_11 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_9[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_11[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_0 : f32) outs(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_17 = memref.subview %alloc_1[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_18 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_19[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_14 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_15 = memref.subview %subview_6[%3, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_16 = memref.subview %alloc_3[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_2[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_4[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%subview_19 = memref.subview %alloc_1[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %subview_18[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%6 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%8 = arith.extf %5 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %6 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %9, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_3 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_6[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_2 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_3[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_1 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_2[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_4[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_18 = memref.subview %alloc_1[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %alloc[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
%subview_17 = memref.subview %alloc_5[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_17 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_5 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_7 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_5 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_4 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_3 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_2 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_1 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
return | |
} | |
// -----// IR Dump Before LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- // | |
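// NOTE (editorial): relative to the previous dump, HoistStaticallyBoundAllocationsPass has lifted the two staging buffers that were allocated inside the scf.forall body (%alloc_9 and %alloc_11 above) to the function entry block (%alloc_0 and %alloc below), with matching memref.dealloc ops before the return. The pass about to run, iree-codegen-lower-ukernel-ops-to-calls, rewrites iree_codegen.ukernel.* microkernel ops into plain function calls; no such ops appear in this module, so it should pass through unchanged.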
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_0 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_10 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_0 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_0 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_11 = memref.subview %subview_8[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_0[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_1 : f32) outs(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_17 = memref.subview %alloc_3[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_18 = memref.subview %alloc_2[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_19[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_14 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_5 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_15 = memref.subview %subview_8[%3, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_4 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_16 = memref.subview %alloc_5[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_4[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_6[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%subview_19 = memref.subview %alloc_3[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc_2[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %subview_18[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%6 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%8 = arith.extf %5 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %6 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %9, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_5 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_8[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_4 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_5[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_4[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_18 = memref.subview %alloc_3[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %alloc_2[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
%subview_17 = memref.subview %alloc_7[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_17 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_9 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_0 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- // | |
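// NOTE (editorial): as its name suggests, iree-codegen-erase-hal-descriptor-type-from-memref strips the #hal.descriptor_type<storage_buffer> memory-space annotation from the memref types of the interface bindings (visible in the subspan results and subviews below), leaving those memrefs in the default memory space for the downstream AIE-specific lowering. This reading is inferred from the pass name and the types in the dump; the log does not describe the pass itself.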
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_0 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16, #hal.descriptor_type<storage_buffer>> to memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_8 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16, #hal.descriptor_type<storage_buffer>> to memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32, #hal.descriptor_type<storage_buffer>> to memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_10 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_0 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_0 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_11 = memref.subview %subview_8[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_0[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_1 : f32) outs(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_17 = memref.subview %alloc_3[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_18 = memref.subview %alloc_2[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_19[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_14 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_5 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_15 = memref.subview %subview_8[%3, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_4 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_16 = memref.subview %alloc_5[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_4[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_6[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%subview_19 = memref.subview %alloc_3[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc_2[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %subview_18[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%6 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%8 = arith.extf %5 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %6 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %9, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_5 : (memref<64x32xbf16, strided<[256, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_8[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_4 : (memref<32x64xbf16, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_5[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_4[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_18 = memref.subview %alloc_3[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %alloc_2[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
%subview_17 = memref.subview %alloc_7[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_17 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_9 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_0 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_0 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c7 = arith.constant 7 : index | |
%cst_1 = arith.constant 0.000000e+00 : f32 | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
memref.assume_alignment %2, 64 : memref<128x128xf32> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %0[%arg0, 0] [64, 256] [1, 1] : memref<128x256xbf16> to memref<64x256xbf16, strided<[256, 1], offset: ?>> | |
%subview_8 = memref.subview %1[0, %arg1] [256, 64] [1, 1] : memref<256x128xbf16> to memref<256x64xbf16, strided<[128, 1], offset: ?>> | |
%subview_9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, strided<[128, 1], offset: ?>> | |
%subview_10 = memref.subview %subview[0, 0] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_0 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_0 : (memref<64x32xbf16, strided<[256, 1], offset: ?>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_11 = memref.subview %subview_8[0, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc : (memref<32x64xbf16, strided<[128, 1], offset: ?>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_0[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst_1 : f32) outs(%subview_16 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_17 = memref.subview %alloc_3[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_18 = memref.subview %alloc_2[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_17[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_19[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_14 = memref.subview %subview[0, %3] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
iree_linalg_ext.pack %subview_14 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_5 : (memref<64x32xbf16, strided<[256, 1], offset: ?>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_15 = memref.subview %subview_8[%3, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_4 : (memref<32x64xbf16, strided<[128, 1], offset: ?>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_16 = memref.subview %alloc_5[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_17 = memref.subview %alloc_4[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_17 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_18 = memref.subview %alloc_6[%arg3, %arg4, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%subview_19 = memref.subview %alloc_3[0, 0, %arg7, %arg5, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %alloc_2[0, 0, %arg6, %arg7, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_21 = memref.subview %subview_18[0, 0, %arg6, %arg5, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%6 = vector.transfer_read %subview_21[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%8 = arith.extf %5 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %6 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %9, %subview_21[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_12 = memref.subview %subview[0, 224] [64, 32] [1, 1] : memref<64x256xbf16, strided<[256, 1], offset: ?>> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_5 : (memref<64x32xbf16, strided<[256, 1], offset: ?>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_13 = memref.subview %subview_8[224, 0] [32, 64] [1, 1] : memref<256x64xbf16, strided<[128, 1], offset: ?>> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_4 : (memref<32x64xbf16, strided<[128, 1], offset: ?>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_14 = memref.subview %alloc_5[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_4[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_16 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%subview_18 = memref.subview %alloc_3[0, 0, %arg6, %arg4, 0, 0] [1, 1, 1, 1, 4, 8] [1, 1, 1, 1, 1, 1] : memref<1x1x4x8x4x8xbf16, 2 : i32> to memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %alloc_2[0, 0, %arg5, %arg6, 0, 0] [1, 1, 1, 1, 8, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x4x8x4xbf16, 2 : i32> to memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32> | |
%subview_20 = memref.subview %subview_16[0, 0, %arg5, %arg4, 0, 0] [1, 1, 1, 1, 4, 4] [1, 1, 1, 1, 1, 1] : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> to memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%3 = vector.transfer_read %subview_18[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x8xbf16, strided<[1024, 1024, 256, 32, 8, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %subview_19[%c0, %c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x8x4xbf16, strided<[1024, 1024, 128, 32, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %subview_20[%c0, %c0, %c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %subview_20[%c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x1x1x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
} | |
} | |
} | |
%subview_17 = memref.subview %alloc_7[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %subview_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_17 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview_9 : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_0 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before AMDAIEPackToDma (iree-amdaie-pack-to-dma) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
memref.assume_alignment %0, 64 : memref<128x256xbf16> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
memref.assume_alignment %1, 64 : memref<256x128xbf16> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
memref.assume_alignment %2, 64 : memref<128x128xf32> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, strided<[128, 1], offset: ?>> | |
%subview_8 = memref.subview %0[%arg0, 0] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_8 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_1 : (memref<64x32xbf16, strided<[256, 1], offset: ?>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_9 = memref.subview %1[0, %arg1] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
iree_linalg_ext.pack %subview_9 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc : (memref<32x64xbf16, strided<[128, 1], offset: ?>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_12 = memref.subview %alloc_1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_13 = memref.subview %alloc[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_14 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%subview_14 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%3 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_12 = memref.subview %0[%arg0, %3] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
iree_linalg_ext.pack %subview_12 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_5 : (memref<64x32xbf16, strided<[256, 1], offset: ?>> memref<2x1x32x32xbf16, 1 : i32>) | |
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_13 = memref.subview %1[%4, %arg1] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_4 : (memref<32x64xbf16, strided<[128, 1], offset: ?>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_14 = memref.subview %alloc_5[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_15 = memref.subview %alloc_4[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%5 = vector.transfer_read %alloc_3[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%6 = vector.transfer_read %alloc_2[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%7 = vector.transfer_read %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%8 = arith.extf %5 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%9 = arith.extf %6 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%10 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %9, %7 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %10, %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_10 = memref.subview %0[%arg0, 224] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
iree_linalg_ext.pack %subview_10 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_5 : (memref<64x32xbf16, strided<[256, 1], offset: ?>> memref<2x1x32x32xbf16, 1 : i32>) | |
%subview_11 = memref.subview %1[224, %arg1] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
iree_linalg_ext.pack %subview_11 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %alloc_4 : (memref<32x64xbf16, strided<[128, 1], offset: ?>> memref<1x2x32x32xbf16, 1 : i32>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_12 = memref.subview %alloc_5[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_3 : (memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xbf16, 2 : i32>) | |
%subview_13 = memref.subview %alloc_4[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.pack %subview_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_2 : (memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xbf16, 2 : i32>) | |
%subview_14 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%3 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%4 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%5 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%6 = arith.extf %3 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%7 = arith.extf %4 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %5 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %8, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
%subview_15 = memref.subview %alloc_7[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
iree_linalg_ext.unpack %subview_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_15 : (memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32>) | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
iree_linalg_ext.unpack %alloc_7 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %subview : (memref<2x2x32x32xf32, 1 : i32> memref<64x64xf32, strided<[128, 1], offset: ?>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_1 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before AMDAIENormalizeLoopBounds (iree-amdaie-normalize-loop-bounds) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%8 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%10 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%11 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%12 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%13 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%15 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%16 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%17 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {} : memref<2x2x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%19 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%20 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%21 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
%23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
%24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %21, 64 : memref<128x256xbf16> | |
%25 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
%27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
%28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %25, 64 : memref<256x128xbf16> | |
%29 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %29, 64 : memref<128x128xf32> | |
scf.forall (%arg0, %arg1) = (0, 0) to (128, 128) step (64, 64) { | |
%subview = memref.subview %29[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, strided<[128, 1], offset: ?>> | |
%subview_8 = memref.subview %21[%arg0, 0] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%31 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %24[0, 0, %arg0, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
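// A worked reading of the dma_cpy_nd access pattern above (offsets / sizes / strides per
// dimension, counted in elements), assuming the usual interpretation of these operands:
//   destination %3 : [0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1]
//                    fills the whole 2x1x32x32 buffer contiguously (2*1*32*32 = 2048 bf16 elements);
//   source %24     : [0, 0, %arg0, 0] [2, 1, 32, 32] [8192, 32, 256, 1]
//                    gathers a 64x32 slab of the 128x256 A matrix starting at row %arg0:
//                    stride 256 steps one row, stride 1 steps one column, and stride 8192 = 32*256
//                    jumps to the next block of 32 rows.
// Each such prologue copy therefore stages rows [%arg0, %arg0+64) x columns [0, 32) of A into the
// memory-space-1 buffer (presumably the shared memtile staging level).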
%subview_9 = memref.subview %25[0, %arg1] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%32 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %28[0, 0, 0, %arg1] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_12 = memref.subview %alloc_1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%36 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %2[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_13 = memref.subview %alloc[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%37 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview_14 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%subview_14 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
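// The three loops below are the per-core micro-kernel over an 8 (M-tile) x 8 (N-tile) x 4 (K-tile)
// iteration space: each step reads a 4x8 bf16 tile of A from %alloc_3 and an 8x4 bf16 tile of B
// from %alloc_2, widens both to f32 with arith.extf, and uses vector.contract to accumulate a
// 4x8 * 8x4 matmul into a 4x4 f32 tile of %alloc_6. A scalar sketch of one such step, assuming
// row-major tiles (illustration only, not part of the dump):
//   for i in 0..4: for j in 0..4: for k in 0..8:
//     C[i][j] += f32(A[i][k]) * f32(B[k][j])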
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%38 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%39 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%40 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%41 = arith.extf %38 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%42 = arith.extf %39 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %42, %40 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %43, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
scf.for %arg2 = %c1 to %c7 step %c1 { | |
%36 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_12 = memref.subview %21[%arg0, %36] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
%37 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %23[0, 0, %arg0, %36] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%38 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2) | |
%subview_13 = memref.subview %25[%38, %arg1] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
%39 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %27[0, 0, %38, %arg1] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_14 = memref.subview %alloc_5[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%40 = amdaie.dma_cpy_nd(%8[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %16[%arg3, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_15 = memref.subview %alloc_4[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%41 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %12[0, %arg4, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%42 = vector.transfer_read %alloc_3[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%43 = vector.transfer_read %alloc_2[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%44 = vector.transfer_read %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%45 = arith.extf %42 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%46 = arith.extf %43 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %44 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %47, %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_10 = memref.subview %21[%arg0, 224] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
%33 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %22[0, 0, %arg0, 224] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%subview_11 = memref.subview %25[224, %arg1] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
%34 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %26[0, 0, 224, %arg1] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_12 = memref.subview %alloc_5[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%36 = amdaie.dma_cpy_nd(%7[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %14[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_13 = memref.subview %alloc_4[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%37 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %10[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview_14 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%39 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%40 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%41 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%42 = arith.extf %39 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%43 = arith.extf %40 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %43, %41 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %44, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
%subview_15 = memref.subview %alloc_7[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%38 = amdaie.dma_cpy_nd(%20[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %18[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>>) | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%35 = amdaie.dma_cpy_nd(%30[%arg0, %arg1] [64, 64] [128, 1], %19[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xf32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>) | |
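// %35 writes one 64x64 block of the 128x128 result. The destination view
// %30[%arg0, %arg1] [64, 64] [128, 1] is a row-major 64x64 window of the output, while the source
// pattern [2, 32, 2, 32] / [2048, 32, 1024, 1] walks the 2x2x32x32 accumulator buffer in the order
// (tile row, row within tile, tile column, column within tile), so its four 32x32 tiles are
// interleaved back into a single contiguous 64x64 tile as they are copied out.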
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_1 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
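// Reading note for the dump above: the 128x128x256 matmul is tiled as a 2x2 grid of 64x64 output
// blocks (the scf.forall with #gpu.block mapping and step 64), and the K = 256 reduction is split
// into 8 tiles of 32. The first K tile (offset 0) and the last one (offset 224 = 7 * 32) are peeled
// as prologue and epilogue around the scf.for %arg2 = 1 to 7 loop, which covers the remaining
// 6 tiles. Buffers in memory space 1 appear to act as the shared staging level and those in memory
// space 2 as the core-local level, with amdaie.dma_cpy_nd moving tiles between them.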
// -----// IR Dump Before AMDAIEInsertCores (iree-amdaie-insert-cores) //----- // | |
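// iree-amdaie-insert-cores consumes the dump below. Judging from the dump that follows it, the
// pass wraps each innermost compute nest in an amdaie.core region bound to an amdaie.tile built
// from the thread-level forall induction variables (the second coordinate offset by 2, presumably
// to skip the shim and memtile rows), and records the DMA ops the core reads or writes in its
// in/out lists. A minimal sketch of the resulting shape (names are illustrative only):
//   %row  = arith.addi %thread_y, %c2 : index
//   %tile = amdaie.tile(%thread_x, %row)
//   %core = amdaie.core(%tile, in : [%dma_a, %dma_b], out : []) {
//     ... // the existing scf.for / vector.contract nest moves in here
//     amdaie.end
//   }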
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%8 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%10 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%11 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%12 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%13 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%15 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%16 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%17 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {} : memref<2x2x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%19 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%20 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%21 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
%23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
%24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %21, 64 : memref<128x256xbf16> | |
%25 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
%27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
%28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %25, 64 : memref<256x128xbf16> | |
%29 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %29, 64 : memref<128x128xf32> | |
%c2 = arith.constant 2 : index | |
%c1_8 = arith.constant 1 : index | |
%c2_9 = arith.constant 2 : index | |
%c1_10 = arith.constant 1 : index | |
scf.forall (%arg0, %arg1) in (2, 2) { | |
%31 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1) | |
%32 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) | |
%subview = memref.subview %29[%32, %31] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, strided<[128, 1], offset: ?>> | |
%subview_11 = memref.subview %21[%32, 0] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%33 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %24[0, 0, %32, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%subview_12 = memref.subview %25[0, %31] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%34 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %28[0, 0, 0, %31] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_16 = memref.subview %alloc_1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%38 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %2[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_17 = memref.subview %alloc[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%39 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview_18 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%subview_18 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%40 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%41 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%42 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%43 = arith.extf %40 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%44 = arith.extf %41 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %43, %44, %42 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %45, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%c6 = arith.constant 6 : index | |
%c0_13 = arith.constant 0 : index | |
scf.for %arg2 = %c0_13 to %c6 step %c1 { | |
%38 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
%39 = affine.apply affine_map<(d0) -> (d0 * 32)>(%38) | |
%subview_16 = memref.subview %21[%32, %39] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
%40 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %23[0, 0, %32, %39] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%41 = affine.apply affine_map<(d0) -> (d0 * 32)>(%38) | |
%subview_17 = memref.subview %25[%41, %31] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
%42 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %27[0, 0, %41, %31] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_18 = memref.subview %alloc_5[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%43 = amdaie.dma_cpy_nd(%8[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %16[%arg3, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_19 = memref.subview %alloc_4[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%44 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %12[0, %arg4, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%45 = vector.transfer_read %alloc_3[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%46 = vector.transfer_read %alloc_2[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%47 = vector.transfer_read %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%48 = arith.extf %45 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%49 = arith.extf %46 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %49, %47 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %50, %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_14 = memref.subview %21[%32, 224] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
%35 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %22[0, 0, %32, 224] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%subview_15 = memref.subview %25[224, %31] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
%36 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %26[0, 0, 224, %31] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_16 = memref.subview %alloc_5[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%38 = amdaie.dma_cpy_nd(%7[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %14[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_17 = memref.subview %alloc_4[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%39 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %10[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview_18 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%41 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%42 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%43 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%44 = arith.extf %41 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%45 = arith.extf %42 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%46 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %45, %43 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %46, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
%subview_19 = memref.subview %alloc_7[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%40 = amdaie.dma_cpy_nd(%20[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %18[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>>) | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%37 = amdaie.dma_cpy_nd(%30[%32, %31] [64, 64] [128, 1], %19[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xf32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_1 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before AMDAIELocalizeLogicalObjectfifo (iree-amdaie-localize-logicalobjectfifo) //----- // | |
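// iree-amdaie-localize-logicalobjectfifo runs on the dump below. Comparing it with the next dump,
// the pass appears to clone amdaie.logicalobjectfifo.from_memref ops into the scf.forall body that
// uses them (a from_memref of %alloc, for example, reappears inside the block-level forall), so
// each parallel region holds its own local handle instead of sharing one hoisted to the function
// entry.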
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%8 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%10 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%11 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%12 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%13 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%14 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%15 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%16 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%17 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%18 = amdaie.logicalobjectfifo.from_memref %alloc_6, {} : memref<2x2x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%19 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%20 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%21 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%22 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
%23 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
%24 = amdaie.logicalobjectfifo.from_memref %21, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %21, 64 : memref<128x256xbf16> | |
%25 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%26 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
%27 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
%28 = amdaie.logicalobjectfifo.from_memref %25, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %25, 64 : memref<256x128xbf16> | |
%29 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%30 = amdaie.logicalobjectfifo.from_memref %29, {} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %29, 64 : memref<128x128xf32> | |
%c2 = arith.constant 2 : index | |
%c1_8 = arith.constant 1 : index | |
%c2_9 = arith.constant 2 : index | |
%c1_10 = arith.constant 1 : index | |
scf.forall (%arg0, %arg1) in (2, 2) { | |
%31 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1) | |
%32 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) | |
%subview = memref.subview %29[%32, %31] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, strided<[128, 1], offset: ?>> | |
%subview_11 = memref.subview %21[%32, 0] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%33 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %24[0, 0, %32, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%subview_12 = memref.subview %25[0, %31] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%34 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %28[0, 0, 0, %31] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_16 = memref.subview %alloc_1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%38 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %2[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_17 = memref.subview %alloc[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%39 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview_18 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%c2_19 = arith.constant 2 : index | |
%40 = arith.addi %arg2, %c2_19 : index | |
%tile = amdaie.tile(%arg3, %40) | |
%41 = amdaie.core(%tile, in : [%38, %39], out : []) { | |
linalg.fill ins(%cst : f32) outs(%subview_18 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%42 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%43 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%44 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%45 = arith.extf %42 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%46 = arith.extf %43 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %46, %44 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %47, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%c6 = arith.constant 6 : index | |
%c0_13 = arith.constant 0 : index | |
scf.for %arg2 = %c0_13 to %c6 step %c1 { | |
%38 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
%39 = affine.apply affine_map<(d0) -> (d0 * 32)>(%38) | |
%subview_16 = memref.subview %21[%32, %39] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
%40 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %23[0, 0, %32, %39] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%41 = affine.apply affine_map<(d0) -> (d0 * 32)>(%38) | |
%subview_17 = memref.subview %25[%41, %31] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
%42 = amdaie.dma_cpy_nd(%13[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %27[0, 0, %41, %31] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_18 = memref.subview %alloc_5[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%43 = amdaie.dma_cpy_nd(%8[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %16[%arg3, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_19 = memref.subview %alloc_4[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%44 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %12[0, %arg4, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%c2_20 = arith.constant 2 : index | |
%45 = arith.addi %arg3, %c2_20 : index | |
%tile = amdaie.tile(%arg4, %45) | |
%46 = amdaie.core(%tile, in : [%43, %44], out : []) { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%47 = vector.transfer_read %alloc_3[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%48 = vector.transfer_read %alloc_2[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%49 = vector.transfer_read %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%50 = arith.extf %47 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%51 = arith.extf %48 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %50, %51, %49 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %52, %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_14 = memref.subview %21[%32, 224] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
%35 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %22[0, 0, %32, 224] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%subview_15 = memref.subview %25[224, %31] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
%36 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %26[0, 0, 224, %31] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_16 = memref.subview %alloc_5[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%38 = amdaie.dma_cpy_nd(%7[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %14[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_17 = memref.subview %alloc_4[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%39 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %10[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview_18 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %alloc_7[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%40 = amdaie.dma_cpy_nd(%20[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %18[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>>) | |
%c2_20 = arith.constant 2 : index | |
%41 = arith.addi %arg2, %c2_20 : index | |
%tile = amdaie.tile(%arg3, %41) | |
%42 = amdaie.core(%tile, in : [%38, %39], out : [%40]) { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%43 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%44 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%45 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%46 = arith.extf %43 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%47 = arith.extf %44 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %47, %45 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %48, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%37 = amdaie.dma_cpy_nd(%30[%32, %31] [64, 64] [128, 1], %19[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xf32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_1 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
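// cse runs on the dump below. It should be able to fold the textually identical, result-equivalent
// ops still present there: repeated amdaie.logicalobjectfifo.from_memref ops over the same memref
// with the same (empty) tile list, and duplicate index constants such as %c2 / %c2_9 and
// %c1_8 / %c1_10.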
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c7 = arith.constant 7 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%5 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%6 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%7 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%8 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
%9 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
%10 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %7, 64 : memref<128x256xbf16> | |
%11 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%12 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
%13 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
%14 = amdaie.logicalobjectfifo.from_memref %11, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %11, 64 : memref<256x128xbf16> | |
%15 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%16 = amdaie.logicalobjectfifo.from_memref %15, {} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %15, 64 : memref<128x128xf32> | |
%c2 = arith.constant 2 : index | |
%c1_8 = arith.constant 1 : index | |
%c2_9 = arith.constant 2 : index | |
%c1_10 = arith.constant 1 : index | |
scf.forall (%arg0, %arg1) in (2, 2) { | |
%17 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1) | |
%18 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) | |
%subview = memref.subview %15[%18, %17] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, strided<[128, 1], offset: ?>> | |
%subview_11 = memref.subview %7[%18, 0] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%19 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %10[0, 0, %18, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%subview_12 = memref.subview %11[0, %17] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%20 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %14[0, 0, 0, %17] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%21 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%22 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%23 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%24 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_16 = memref.subview %alloc_1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%34 = amdaie.dma_cpy_nd(%24[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %22[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_17 = memref.subview %alloc[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%35 = amdaie.dma_cpy_nd(%23[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %21[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview_18 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%c2_19 = arith.constant 2 : index | |
%36 = arith.addi %arg2, %c2_19 : index | |
%tile = amdaie.tile(%arg3, %36) | |
%37 = amdaie.core(%tile, in : [%34, %35], out : []) { | |
linalg.fill ins(%cst : f32) outs(%subview_18 : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%38 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%39 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%40 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%41 = arith.extf %38 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%42 = arith.extf %39 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %42, %40 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %43, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%c6 = arith.constant 6 : index | |
%c0_13 = arith.constant 0 : index | |
scf.for %arg2 = %c0_13 to %c6 step %c1 { | |
%34 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
%35 = affine.apply affine_map<(d0) -> (d0 * 32)>(%34) | |
%subview_16 = memref.subview %7[%18, %35] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
%36 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %9[0, 0, %18, %35] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%37 = affine.apply affine_map<(d0) -> (d0 * 32)>(%34) | |
%subview_17 = memref.subview %11[%37, %17] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
%38 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %13[0, 0, %37, %17] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%39 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%40 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%41 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%42 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%subview_18 = memref.subview %alloc_5[%arg3, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%43 = amdaie.dma_cpy_nd(%40[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %42[%arg3, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_19 = memref.subview %alloc_4[0, %arg4, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%44 = amdaie.dma_cpy_nd(%39[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %41[0, %arg4, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%c2_20 = arith.constant 2 : index | |
%45 = arith.addi %arg3, %c2_20 : index | |
%tile = amdaie.tile(%arg4, %45) | |
%46 = amdaie.core(%tile, in : [%43, %44], out : []) { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%47 = vector.transfer_read %alloc_3[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%48 = vector.transfer_read %alloc_2[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%49 = vector.transfer_read %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%50 = arith.extf %47 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%51 = arith.extf %48 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %50, %51, %49 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %52, %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%subview_14 = memref.subview %7[%18, 224] [64, 32] [1, 1] : memref<128x256xbf16> to memref<64x32xbf16, strided<[256, 1], offset: ?>> | |
%25 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %8[0, 0, %18, 224] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%subview_15 = memref.subview %11[224, %17] [32, 64] [1, 1] : memref<256x128xbf16> to memref<32x64xbf16, strided<[128, 1], offset: ?>> | |
%26 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %12[0, 0, 224, %17] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%27 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%28 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%29 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%30 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%31 = amdaie.logicalobjectfifo.from_memref %alloc_6, {} : memref<2x2x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>> | |
%32 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%subview_16 = memref.subview %alloc_5[%arg2, 0, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x1x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[1024, 1024, 32, 1], offset: ?>, 1 : i32> | |
%34 = amdaie.dma_cpy_nd(%28[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %30[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%subview_17 = memref.subview %alloc_4[0, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x2x32x32xbf16, 1 : i32> to memref<1x1x32x32xbf16, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%35 = amdaie.dma_cpy_nd(%27[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %29[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview_18 = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%subview_19 = memref.subview %alloc_7[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<2x2x32x32xf32, 1 : i32> to memref<1x1x32x32xf32, strided<[2048, 1024, 32, 1], offset: ?>, 1 : i32> | |
%36 = amdaie.dma_cpy_nd(%32[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %31[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>>) | |
%c2_20 = arith.constant 2 : index | |
%37 = arith.addi %arg2, %c2_20 : index | |
%tile = amdaie.tile(%arg3, %37) | |
%38 = amdaie.core(%tile, in : [%34, %35], out : [%36]) { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%39 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%40 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%41 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%42 = arith.extf %39 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%43 = arith.extf %40 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %43, %41 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %44, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%33 = amdaie.dma_cpy_nd(%16[%18, %17] [64, 64] [128, 1], %6[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xf32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_1 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
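// The module above is the input to the CSE run recorded next. It still carries
// the redundancy CSE exists to remove: %c1_8, %c1_10 and %c2_9 re-materialize
// %c1 and %c2, and several amdaie.logicalobjectfifo.from_memref results are
// taken from the same source (%2/%3 from %alloc_4, %4/%5 from %alloc_5,
// %8/%9/%10 and %12/%13/%14 from the binding subspans). In the following dump
// (before AMDAIEDistributeCoresAndObjectFifos) each of these appears only once.
// A minimal, hypothetical illustration of the transformation, not taken from
// this log and using only upstream ops, would be an input like this to
// `mlir-opt --cse`:
func.func @dup(%i: index) -> (index, index) {
  %c2_a = arith.constant 2 : index
  %c2_b = arith.constant 2 : index   // structurally identical to %c2_a
  %x = arith.addi %i, %c2_a : index
  %y = arith.addi %i, %c2_b : index  // identical to %x once %c2_b is replaced
  return %x, %y : index, index
}
// CSE keeps one constant and one add and returns %x twice; no canonicalization
// patterns are needed for this, CSE alone removes the duplicates.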
// -----// IR Dump Before AMDAIEDistributeCoresAndObjectFifos (iree-amdaie-distribute-cores-and-objectfifos) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_6 = memref.alloc() : memref<2x2x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_7, {} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%5 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%6 = amdaie.logicalobjectfifo.from_memref %5, {} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %5, 64 : memref<128x256xbf16> | |
%7 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%8 = amdaie.logicalobjectfifo.from_memref %7, {} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %7, 64 : memref<256x128xbf16> | |
%9 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%10 = amdaie.logicalobjectfifo.from_memref %9, {} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %9, 64 : memref<128x128xf32> | |
scf.forall (%arg0, %arg1) in (2, 2) { | |
%11 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1) | |
%12 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%13 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%14 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 0, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%16 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%21 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %1[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%22 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%subview = memref.subview %alloc_6[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : memref<2x2x8x8x4x4xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> | |
%c2 = arith.constant 2 : index | |
%23 = arith.addi %arg2, %c2 : index | |
%tile = amdaie.tile(%arg3, %23) | |
%24 = amdaie.core(%tile, in : [%21, %22], out : []) { | |
linalg.fill ins(%cst : f32) outs(%subview : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>) | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%25 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%26 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%27 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%28 = arith.extf %25 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%29 = arith.extf %26 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%30 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %29, %27 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %30, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%c6 = arith.constant 6 : index | |
scf.for %arg2 = %c0 to %c6 step %c1 { | |
%21 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
%22 = affine.apply affine_map<(d0) -> (d0 * 32)>(%21) | |
%23 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, %22] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%24 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, %22, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
scf.forall (%arg3, %arg4) in (2, 2) { | |
%25 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%arg3, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%26 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %arg4, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%c2 = arith.constant 2 : index | |
%27 = arith.addi %arg3, %c2 : index | |
%tile = amdaie.tile(%arg4, %27) | |
%28 = amdaie.core(%tile, in : [%25, %26], out : []) { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c8 step %c1 { | |
scf.for %arg7 = %c0 to %c4 step %c1 { | |
%29 = vector.transfer_read %alloc_3[%c0, %c0, %arg7, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%30 = vector.transfer_read %alloc_2[%c0, %c0, %arg6, %arg7, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%31 = vector.transfer_read %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%32 = arith.extf %29 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%33 = arith.extf %30 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%34 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %32, %33, %31 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %34, %alloc_6[%arg3, %arg4, %arg6, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
} | |
%17 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 224] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%18 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 224, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%19 = amdaie.logicalobjectfifo.from_memref %alloc_6, {} : memref<2x2x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>> | |
scf.forall (%arg2, %arg3) in (2, 2) { | |
%21 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%arg2, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%22 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%23 = amdaie.dma_cpy_nd(%4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %19[%arg2, %arg3, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x2x8x8x4x4xf32, 2 : i32>>) | |
%c2 = arith.constant 2 : index | |
%24 = arith.addi %arg2, %c2 : index | |
%tile = amdaie.tile(%arg3, %24) | |
%25 = amdaie.core(%tile, in : [%21, %22], out : [%23]) { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c8 step %c1 { | |
scf.for %arg6 = %c0 to %c4 step %c1 { | |
%26 = vector.transfer_read %alloc_3[%c0, %c0, %arg6, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%27 = vector.transfer_read %alloc_2[%c0, %c0, %arg5, %arg6, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%28 = vector.transfer_read %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<2x2x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%29 = arith.extf %26 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%30 = arith.extf %27 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %29, %30, %28 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %31, %alloc_6[%arg2, %arg3, %arg5, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<2x2x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} | |
%20 = amdaie.dma_cpy_nd(%10[%12, %11] [64, 64] [128, 1], %4[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xf32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_6 : memref<2x2x8x8x4x4xf32, 2 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_1 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
return | |
} | |
} | |
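// The dump that follows reflects the result of AMDAIEDistributeCoresAndObjectFifos:
// the inner scf.forall thread loops are unrolled into one amdaie.tile /
// amdaie.core pair per (column, row) coordinate with constant tile operands,
// and the shared L1 accumulator memref<2x2x8x8x4x4xf32, 2 : i32> is replaced by
// a per-core memref<1x1x8x8x4x4xf32, 2 : i32> reached through
// amdaie.logicalobjectfifo.access instead of memref.subview. The hypothetical
// helper below (upstream memref ops only, not part of this pipeline) spells out
// which slice of the shared accumulator the core at thread ids (%ty, %tx) owned
// before that split; the strided result type is the same subview type that
// appears in the dumps above.
func.func @per_core_slice(%acc: memref<2x2x8x8x4x4xf32, 2 : i32>,
                          %ty: index, %tx: index)
    -> memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32> {
  // One 8x8 grid of 4x4 f32 accumulator tiles per core.
  %slice = memref.subview %acc[%ty, %tx, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1]
      : memref<2x2x8x8x4x4xf32, 2 : i32>
        to memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
  return %slice : memref<1x1x8x8x4x4xf32, strided<[2048, 1024, 128, 16, 4, 1], offset: ?>, 2 : i32>
}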
// -----// IR Dump Before CSE (cse) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c3 = arith.constant 3 : index | |
%c2 = arith.constant 2 : index | |
%c6 = arith.constant 6 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%tile = amdaie.tile(%c0, %c1) | |
%tile_1 = amdaie.tile(%c1, %c1) | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%tile_3 = amdaie.tile(%c0, %c1) | |
%tile_4 = amdaie.tile(%c1, %c1) | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_3} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_6 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%tile_8 = amdaie.tile(%c0, %c1) | |
%tile_9 = amdaie.tile(%c1, %c1) | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile_8} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_10 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%tile_11 = amdaie.tile(%c0, %c1) | |
%tile_12 = amdaie.tile(%c1, %c1) | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_10, {%tile_11} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_13 = memref.alloc() : memref<1x1x8x8x4x4xf32, 2 : i32> | |
%alloc_14 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%tile_15 = amdaie.tile(%c0, %c1) | |
%tile_16 = amdaie.tile(%c1, %c1) | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_14, {%tile_15} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%5 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%tile_17 = amdaie.tile(%c0, %c0) | |
%tile_18 = amdaie.tile(%c1, %c0) | |
%6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_17} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %5, 64 : memref<128x256xbf16> | |
%7 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%tile_19 = amdaie.tile(%c0, %c0) | |
%tile_20 = amdaie.tile(%c1, %c0) | |
%8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_19} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %7, 64 : memref<256x128xbf16> | |
%9 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%tile_21 = amdaie.tile(%c0, %c0) | |
%tile_22 = amdaie.tile(%c1, %c0) | |
%10 = amdaie.logicalobjectfifo.from_memref %9, {%tile_21} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %9, 64 : memref<128x128xf32> | |
scf.forall (%arg0, %arg1) in (2, 2) { | |
%11 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1) | |
%12 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_10 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_2 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%13 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_7 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%14 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 0, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%tile_23 = amdaie.tile(%c1, %c3) | |
%tile_24 = amdaie.tile(%c1, %c2) | |
%15 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_24, %tile_23} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%tile_25 = amdaie.tile(%c1, %c3) | |
%tile_26 = amdaie.tile(%c1, %c2) | |
%16 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_26, %tile_25} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%tile_27 = amdaie.tile(%c1, %c3) | |
%tile_28 = amdaie.tile(%c1, %c2) | |
%17 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_28, %tile_27} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%tile_29 = amdaie.tile(%c0, %c3) | |
%tile_30 = amdaie.tile(%c0, %c2) | |
%18 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_30, %tile_29} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%tile_31 = amdaie.tile(%c1, %c3) | |
%tile_32 = amdaie.tile(%c0, %c3) | |
%19 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_32, %tile_31} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%tile_33 = amdaie.tile(%c1, %c3) | |
%tile_34 = amdaie.tile(%c0, %c3) | |
%20 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_34, %tile_33} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%tile_35 = amdaie.tile(%c1, %c3) | |
%tile_36 = amdaie.tile(%c0, %c3) | |
%21 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_36, %tile_35} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%tile_37 = amdaie.tile(%c1, %c2) | |
%tile_38 = amdaie.tile(%c0, %c2) | |
%22 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_38, %tile_37} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%23 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, %c0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%24 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, %c1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%25 = amdaie.dma_cpy_nd(%22[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %1[%c0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%tile_39 = amdaie.tile(%c0, %c2) | |
%tile_40 = amdaie.tile(%c0, %c2) | |
%26 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_40} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%27 = amdaie.core(%tile_39, in : [%25, %23], out : []) { | |
%c0_55 = arith.constant 0 : index | |
%54 = amdaie.logicalobjectfifo.access(%26, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%54 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%55 = amdaie.logicalobjectfifo.access(%22, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%56 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%57 = vector.transfer_read %55[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%58 = vector.transfer_read %56[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%59 = vector.transfer_read %54[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%60 = arith.extf %57 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%61 = arith.extf %58 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %59 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %62, %54[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%tile_41 = amdaie.tile(%c1, %c2) | |
%tile_42 = amdaie.tile(%c1, %c2) | |
%28 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_42} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%29 = amdaie.core(%tile_41, in : [%25, %24], out : []) { | |
%c0_55 = arith.constant 0 : index | |
%54 = amdaie.logicalobjectfifo.access(%28, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%54 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%55 = amdaie.logicalobjectfifo.access(%22, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%56 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%57 = vector.transfer_read %55[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%58 = vector.transfer_read %56[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%59 = vector.transfer_read %54[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%60 = arith.extf %57 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%61 = arith.extf %58 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %59 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %62, %54[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%30 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %1[%c1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%tile_43 = amdaie.tile(%c0, %c3) | |
%tile_44 = amdaie.tile(%c0, %c3) | |
%31 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_44} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%32 = amdaie.core(%tile_43, in : [%30, %23], out : []) { | |
%c0_55 = arith.constant 0 : index | |
%54 = amdaie.logicalobjectfifo.access(%31, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%54 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%55 = amdaie.logicalobjectfifo.access(%21, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%56 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%57 = vector.transfer_read %55[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%58 = vector.transfer_read %56[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%59 = vector.transfer_read %54[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%60 = arith.extf %57 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%61 = arith.extf %58 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %59 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %62, %54[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%tile_45 = amdaie.tile(%c1, %c3) | |
%tile_46 = amdaie.tile(%c1, %c3) | |
%33 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_46} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%34 = amdaie.core(%tile_45, in : [%30, %24], out : []) { | |
%c0_55 = arith.constant 0 : index | |
%54 = amdaie.logicalobjectfifo.access(%33, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%54 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%55 = amdaie.logicalobjectfifo.access(%21, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%56 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%57 = vector.transfer_read %55[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%58 = vector.transfer_read %56[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%59 = vector.transfer_read %54[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%60 = arith.extf %57 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%61 = arith.extf %58 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %59 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %62, %54[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
scf.for %arg2 = %c0 to %c6 step %c1 { | |
%54 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
%55 = affine.apply affine_map<(d0) -> (d0 * 32)>(%54) | |
%56 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, %55] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%57 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, %55, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%58 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %c0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%59 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %c1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%60 = amdaie.dma_cpy_nd(%22[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%c0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%tile_55 = amdaie.tile(%c0, %c2) | |
%tile_56 = amdaie.tile(%c0, %c2) | |
%61 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_56} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%62 = amdaie.core(%tile_55, in : [%60, %58], out : []) { | |
%c0_63 = arith.constant 0 : index | |
%70 = amdaie.logicalobjectfifo.access(%22, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%71 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%72 = amdaie.logicalobjectfifo.access(%61, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%73 = vector.transfer_read %70[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%74 = vector.transfer_read %71[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%75 = vector.transfer_read %72[%c0_63, %c0_63, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%76 = arith.extf %73 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%77 = arith.extf %74 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %76, %77, %75 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %78, %72[%c0_63, %c0_63, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%tile_57 = amdaie.tile(%c1, %c2) | |
%tile_58 = amdaie.tile(%c1, %c2) | |
%63 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_58} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%64 = amdaie.core(%tile_57, in : [%60, %59], out : []) { | |
%c0_63 = arith.constant 0 : index | |
%70 = amdaie.logicalobjectfifo.access(%22, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%71 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%72 = amdaie.logicalobjectfifo.access(%63, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%73 = vector.transfer_read %70[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%74 = vector.transfer_read %71[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%75 = vector.transfer_read %72[%c0_63, %c0_63, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%76 = arith.extf %73 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%77 = arith.extf %74 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %76, %77, %75 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %78, %72[%c0_63, %c0_63, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%65 = amdaie.dma_cpy_nd(%20[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%c1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%tile_59 = amdaie.tile(%c0, %c3) | |
%tile_60 = amdaie.tile(%c0, %c3) | |
%66 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_60} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%67 = amdaie.core(%tile_59, in : [%65, %58], out : []) { | |
%c0_63 = arith.constant 0 : index | |
%70 = amdaie.logicalobjectfifo.access(%20, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%71 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%72 = amdaie.logicalobjectfifo.access(%66, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%73 = vector.transfer_read %70[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%74 = vector.transfer_read %71[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%75 = vector.transfer_read %72[%c0_63, %c0_63, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%76 = arith.extf %73 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%77 = arith.extf %74 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %76, %77, %75 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %78, %72[%c0_63, %c0_63, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%tile_61 = amdaie.tile(%c1, %c3) | |
%tile_62 = amdaie.tile(%c1, %c3) | |
%68 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_62} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%69 = amdaie.core(%tile_61, in : [%65, %59], out : []) { | |
%c0_63 = arith.constant 0 : index | |
%70 = amdaie.logicalobjectfifo.access(%20, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%71 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%72 = amdaie.logicalobjectfifo.access(%68, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%73 = vector.transfer_read %70[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%74 = vector.transfer_read %71[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%75 = vector.transfer_read %72[%c0_63, %c0_63, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%76 = arith.extf %73 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%77 = arith.extf %74 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %76, %77, %75 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %78, %72[%c0_63, %c0_63, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} | |
%35 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 224] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%36 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 224, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%tile_47 = amdaie.tile(%c1, %c3) | |
%37 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_47} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%tile_48 = amdaie.tile(%c1, %c2) | |
%38 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_48} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%tile_49 = amdaie.tile(%c0, %c3) | |
%39 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_49} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%tile_50 = amdaie.tile(%c0, %c2) | |
%40 = amdaie.logicalobjectfifo.from_memref %alloc_13, {%tile_50} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%41 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %c0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%42 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %c1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%43 = amdaie.dma_cpy_nd(%22[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%c0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%44 = amdaie.dma_cpy_nd(%4[%c0, %c0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %40[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%tile_51 = amdaie.tile(%c0, %c2) | |
%45 = amdaie.core(%tile_51, in : [%43, %41], out : [%44]) { | |
%c0_55 = arith.constant 0 : index | |
%54 = amdaie.logicalobjectfifo.access(%22, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%55 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%56 = amdaie.logicalobjectfifo.access(%40, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%57 = vector.transfer_read %54[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%58 = vector.transfer_read %55[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%59 = vector.transfer_read %56[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%60 = arith.extf %57 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%61 = arith.extf %58 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %59 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %62, %56[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%46 = amdaie.dma_cpy_nd(%4[%c0, %c1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %38[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%tile_52 = amdaie.tile(%c1, %c2) | |
%47 = amdaie.core(%tile_52, in : [%43, %42], out : [%46]) { | |
%c0_55 = arith.constant 0 : index | |
%54 = amdaie.logicalobjectfifo.access(%22, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%55 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%56 = amdaie.logicalobjectfifo.access(%38, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%57 = vector.transfer_read %54[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%58 = vector.transfer_read %55[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%59 = vector.transfer_read %56[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%60 = arith.extf %57 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%61 = arith.extf %58 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %59 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %62, %56[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%48 = amdaie.dma_cpy_nd(%19[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%c1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%49 = amdaie.dma_cpy_nd(%4[%c1, %c0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %39[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%tile_53 = amdaie.tile(%c0, %c3) | |
%50 = amdaie.core(%tile_53, in : [%48, %41], out : [%49]) { | |
%c0_55 = arith.constant 0 : index | |
%54 = amdaie.logicalobjectfifo.access(%19, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%55 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%56 = amdaie.logicalobjectfifo.access(%39, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%57 = vector.transfer_read %54[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%58 = vector.transfer_read %55[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%59 = vector.transfer_read %56[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%60 = arith.extf %57 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%61 = arith.extf %58 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %59 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %62, %56[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%51 = amdaie.dma_cpy_nd(%4[%c1, %c1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %37[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%tile_54 = amdaie.tile(%c1, %c3) | |
%52 = amdaie.core(%tile_54, in : [%48, %42], out : [%51]) { | |
%c0_55 = arith.constant 0 : index | |
%54 = amdaie.logicalobjectfifo.access(%19, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%55 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%56 = amdaie.logicalobjectfifo.access(%37, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%57 = vector.transfer_read %54[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%58 = vector.transfer_read %55[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%59 = vector.transfer_read %56[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%60 = arith.extf %57 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%61 = arith.extf %58 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %61, %59 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %62, %56[%c0_55, %c0_55, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%53 = amdaie.dma_cpy_nd(%10[%12, %11] [64, 64] [128, 1], %4[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xf32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_14 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_10 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_7 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_6 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_5 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_13 : memref<1x1x8x8x4x4xf32, 2 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before Canonicalizer (canonicalize) //----- // | |
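// Orientation note (added commentary, not compiler output): the module below shows the peeled matmul just before canonicalization.
// Inside the scf.forall over the 2x2 block grid, the first four amdaie.core ops (tiles (0,2), (1,2), (0,3), (1,3)) form the K-loop
// prologue and begin by linalg.fill-ing their 1x1x8x8x4x4xf32 accumulators; the scf.for over %c6 covers the six middle 32-wide K
// tiles; the last four cores consume the final K tile (offset 224) and drain their accumulators via amdaie.dma_cpy_nd into the
// 2x2x32x32xf32 L2 buffer and from there into the 128x128xf32 output.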
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c3 = arith.constant 3 : index | |
%c2 = arith.constant 2 : index | |
%c6 = arith.constant 6 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%tile = amdaie.tile(%c0, %c1) | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%5 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%tile_8 = amdaie.tile(%c0, %c0) | |
%6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_8} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %5, 64 : memref<128x256xbf16> | |
%7 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_8} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %7, 64 : memref<256x128xbf16> | |
%9 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%10 = amdaie.logicalobjectfifo.from_memref %9, {%tile_8} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %9, 64 : memref<128x128xf32> | |
scf.forall (%arg0, %arg1) in (2, 2) { | |
%11 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1) | |
%12 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%13 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%14 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 0, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%tile_9 = amdaie.tile(%c1, %c3) | |
%tile_10 = amdaie.tile(%c1, %c2) | |
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_10, %tile_9} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%tile_11 = amdaie.tile(%c0, %c3) | |
%tile_12 = amdaie.tile(%c0, %c2) | |
%16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_12, %tile_11} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%17 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_11, %tile_9} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_12, %tile_10} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%19 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, %c0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%20 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, %c1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%21 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %1[%c0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%22 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_12} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%23 = amdaie.core(%tile_12, in : [%21, %19], out : []) { | |
%46 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%46 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%47 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %47[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %48[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %46[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %46[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%24 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_10} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%25 = amdaie.core(%tile_10, in : [%21, %20], out : []) { | |
%46 = amdaie.logicalobjectfifo.access(%24, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%46 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%47 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %47[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %48[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %46[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %46[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%26 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %1[%c1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%27 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_11} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%28 = amdaie.core(%tile_11, in : [%26, %19], out : []) { | |
%46 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%46 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%47 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %47[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %48[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %46[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %46[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%29 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_9} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%30 = amdaie.core(%tile_9, in : [%26, %20], out : []) { | |
%46 = amdaie.logicalobjectfifo.access(%29, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%46 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%47 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %47[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %48[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %46[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %46[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
scf.for %arg2 = %c0 to %c6 step %c1 { | |
%46 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2) | |
%47 = affine.apply affine_map<(d0) -> (d0 * 32)>(%46) | |
%48 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, %47] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%49 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, %47, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%50 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %c0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%51 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %c1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%52 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%c0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%53 = amdaie.core(%tile_12, in : [%52, %50], out : []) { | |
%58 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%59 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%60 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%61 = vector.transfer_read %58[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%62 = vector.transfer_read %59[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%63 = vector.transfer_read %60[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%64 = arith.extf %61 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%65 = arith.extf %62 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %64, %65, %63 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %66, %60[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%54 = amdaie.core(%tile_10, in : [%52, %51], out : []) { | |
%58 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%59 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%60 = amdaie.logicalobjectfifo.access(%24, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%61 = vector.transfer_read %58[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%62 = vector.transfer_read %59[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%63 = vector.transfer_read %60[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%64 = arith.extf %61 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%65 = arith.extf %62 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %64, %65, %63 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %66, %60[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%55 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%c1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%56 = amdaie.core(%tile_11, in : [%55, %50], out : []) { | |
%58 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%59 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%60 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%61 = vector.transfer_read %58[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%62 = vector.transfer_read %59[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%63 = vector.transfer_read %60[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%64 = arith.extf %61 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%65 = arith.extf %62 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %64, %65, %63 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %66, %60[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%57 = amdaie.core(%tile_9, in : [%55, %51], out : []) { | |
%58 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%59 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%60 = amdaie.logicalobjectfifo.access(%29, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%61 = vector.transfer_read %58[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%62 = vector.transfer_read %59[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%63 = vector.transfer_read %60[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%64 = arith.extf %61 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%65 = arith.extf %62 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %64, %65, %63 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %66, %60[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} | |
%31 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 224] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%32 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 224, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%33 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %c0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%34 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, %c1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%35 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%c0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%36 = amdaie.dma_cpy_nd(%4[%c0, %c0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %22[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%37 = amdaie.core(%tile_12, in : [%35, %33], out : [%36]) { | |
%46 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%47 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%22, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %47[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %48[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %48[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%38 = amdaie.dma_cpy_nd(%4[%c0, %c1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %24[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%39 = amdaie.core(%tile_10, in : [%35, %34], out : [%38]) { | |
%46 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%47 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%24, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %47[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %48[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %48[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%40 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[%c1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%41 = amdaie.dma_cpy_nd(%4[%c1, %c0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %27[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%42 = amdaie.core(%tile_11, in : [%40, %33], out : [%41]) { | |
%46 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%47 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%27, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %47[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %48[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %48[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%43 = amdaie.dma_cpy_nd(%4[%c1, %c1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %29[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%44 = amdaie.core(%tile_9, in : [%40, %34], out : [%43]) { | |
%46 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%47 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%29, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %47[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %48[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %48[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%45 = amdaie.dma_cpy_nd(%10[%12, %11] [64, 64] [128, 1], %4[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xf32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_1 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xf32, 2 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before AMDAIESplitLogicalObjFifosForConnectionReuse (iree-amdaie-split-logical-objectfifos-for-connection-reuse) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c3 = arith.constant 3 : index | |
%c2 = arith.constant 2 : index | |
%c6 = arith.constant 6 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%tile = amdaie.tile(%c0, %c1) | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%5 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%tile_8 = amdaie.tile(%c0, %c0) | |
%6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_8} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %5, 64 : memref<128x256xbf16> | |
%7 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_8} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %7, 64 : memref<256x128xbf16> | |
%9 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%10 = amdaie.logicalobjectfifo.from_memref %9, {%tile_8} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %9, 64 : memref<128x128xf32> | |
scf.forall (%arg0, %arg1) in (2, 2) { | |
%11 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1) | |
%12 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%13 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%14 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 0, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%tile_9 = amdaie.tile(%c1, %c3) | |
%tile_10 = amdaie.tile(%c1, %c2) | |
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_10, %tile_9} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%tile_11 = amdaie.tile(%c0, %c3) | |
%tile_12 = amdaie.tile(%c0, %c2) | |
%16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_12, %tile_11} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%17 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_11, %tile_9} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_12, %tile_10} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%19 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%20 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %0[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%21 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %1[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%22 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_12} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%23 = amdaie.core(%tile_12, in : [%21, %19], out : []) { | |
%46 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%46 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%47 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %47[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %48[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %46[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %46[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%24 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_10} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%25 = amdaie.core(%tile_10, in : [%21, %20], out : []) { | |
%46 = amdaie.logicalobjectfifo.access(%24, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%46 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%47 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %47[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %48[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %46[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %46[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%26 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %1[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%27 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_11} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%28 = amdaie.core(%tile_11, in : [%26, %19], out : []) { | |
%46 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%46 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%47 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %47[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %48[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %46[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %46[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%29 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_9} : memref<1x1x8x8x4x4xf32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> | |
%30 = amdaie.core(%tile_9, in : [%26, %20], out : []) { | |
%46 = amdaie.logicalobjectfifo.access(%29, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
linalg.fill ins(%cst : f32) outs(%46 : memref<1x1x8x8x4x4xf32, 2 : i32>) | |
%47 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %47[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %48[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %46[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %46[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
scf.for %arg2 = %c0 to %c6 step %c1 { | |
%46 = affine.apply affine_map<(d0) -> (d0 * 32 + 32)>(%arg2) | |
%47 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, %46] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%48 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, %46, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%49 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%50 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%51 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%52 = amdaie.core(%tile_12, in : [%51, %49], out : []) { | |
%57 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%58 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%59 = amdaie.logicalobjectfifo.access(%22, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%60 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%61 = vector.transfer_read %58[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%62 = vector.transfer_read %59[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%63 = arith.extf %60 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%64 = arith.extf %61 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %63, %64, %62 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %65, %59[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%53 = amdaie.core(%tile_10, in : [%51, %50], out : []) { | |
%57 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%58 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%59 = amdaie.logicalobjectfifo.access(%24, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%60 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%61 = vector.transfer_read %58[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%62 = vector.transfer_read %59[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%63 = arith.extf %60 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%64 = arith.extf %61 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %63, %64, %62 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %65, %59[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%54 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%55 = amdaie.core(%tile_11, in : [%54, %49], out : []) { | |
%57 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%58 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%59 = amdaie.logicalobjectfifo.access(%27, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%60 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%61 = vector.transfer_read %58[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%62 = vector.transfer_read %59[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%63 = arith.extf %60 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%64 = arith.extf %61 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %63, %64, %62 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %65, %59[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%56 = amdaie.core(%tile_9, in : [%54, %50], out : []) { | |
%57 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%58 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%59 = amdaie.logicalobjectfifo.access(%29, None) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c8 step %c1 { | |
scf.for %arg5 = %c0 to %c4 step %c1 { | |
%60 = vector.transfer_read %57[%c0, %c0, %arg5, %arg3, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%61 = vector.transfer_read %58[%c0, %c0, %arg4, %arg5, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%62 = vector.transfer_read %59[%c0, %c0, %arg4, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%63 = arith.extf %60 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%64 = arith.extf %61 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %63, %64, %62 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %65, %59[%c0, %c0, %arg4, %arg3, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
} | |
%31 = amdaie.dma_cpy_nd(%3[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 224] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
%32 = amdaie.dma_cpy_nd(%2[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 224, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%33 = amdaie.dma_cpy_nd(%16[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%34 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %2[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>) | |
%35 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%36 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %22[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%37 = amdaie.core(%tile_12, in : [%35, %33], out : [%36]) { | |
%46 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%47 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%22, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %47[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %48[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %48[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%38 = amdaie.dma_cpy_nd(%4[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %24[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%39 = amdaie.core(%tile_10, in : [%35, %34], out : [%38]) { | |
%46 = amdaie.logicalobjectfifo.access(%18, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%47 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%24, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %47[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %48[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %48[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%40 = amdaie.dma_cpy_nd(%17[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %3[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>) | |
%41 = amdaie.dma_cpy_nd(%4[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %27[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%42 = amdaie.core(%tile_11, in : [%40, %33], out : [%41]) { | |
%46 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%47 = amdaie.logicalobjectfifo.access(%16, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%27, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %47[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %48[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %48[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%43 = amdaie.dma_cpy_nd(%4[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %29[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>>) | |
%44 = amdaie.core(%tile_9, in : [%40, %34], out : [%43]) { | |
%46 = amdaie.logicalobjectfifo.access(%17, Read) : !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> -> memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%47 = amdaie.logicalobjectfifo.access(%15, Read) : !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> -> memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%48 = amdaie.logicalobjectfifo.access(%29, Write) : !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xf32, 2 : i32>> -> memref<1x1x8x8x4x4xf32, 2 : i32> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%49 = vector.transfer_read %46[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 2 : i32>, vector<1x1x1x1x4x8xbf16> | |
%50 = vector.transfer_read %47[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 2 : i32>, vector<1x1x1x1x8x4xbf16> | |
%51 = vector.transfer_read %48[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x8x4x4xf32, 2 : i32>, vector<1x1x1x1x4x4xf32> | |
%52 = arith.extf %49 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%53 = arith.extf %50 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %53, %51 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %54, %48[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, true, true, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 2 : i32> | |
} | |
} | |
} | |
amdaie.end | |
} | |
%45 = amdaie.dma_cpy_nd(%10[%12, %11] [64, 64] [128, 1], %4[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xf32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>>) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
memref.dealloc %alloc_7 : memref<2x2x32x32xf32, 1 : i32> | |
memref.dealloc %alloc_5 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_4 : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_3 : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
memref.dealloc %alloc_2 : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
memref.dealloc %alloc_1 : memref<2x1x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc : memref<1x2x32x32xbf16, 1 : i32> | |
memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xf32, 2 : i32> | |
return | |
} | |
} | |
// -----// IR Dump Before CSE (cse) //----- // | |
module { | |
func.func @matmul_dispatch_0_matmul_128x128x256_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} { | |
%c3 = arith.constant 3 : index | |
%c2 = arith.constant 2 : index | |
%c6 = arith.constant 6 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%cst_0 = arith.constant 0.000000e+00 : bf16 | |
%alloc = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%tile = amdaie.tile(%c0, %c1) | |
%0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_1 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_2 = memref.alloc() : memref<1x1x8x4x8x4xbf16, 2 : i32> | |
%alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xbf16, 2 : i32> | |
%alloc_4 = memref.alloc() : memref<1x2x32x32xbf16, 1 : i32> | |
%2 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile} : memref<1x2x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>> | |
%alloc_5 = memref.alloc() : memref<2x1x32x32xbf16, 1 : i32> | |
%3 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile} : memref<2x1x32x32xbf16, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>> | |
%alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xf32, 2 : i32> | |
%alloc_7 = memref.alloc() : memref<2x2x32x32xf32, 1 : i32> | |
%4 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile} : memref<2x2x32x32xf32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xf32, 1 : i32>> | |
%5 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<128x256xbf16> | |
%tile_8 = amdaie.tile(%c0, %c0) | |
%6 = amdaie.logicalobjectfifo.from_memref %5, {%tile_8} : memref<128x256xbf16> -> !amdaie.logicalobjectfifo<memref<128x256xbf16>> | |
memref.assume_alignment %5, 64 : memref<128x256xbf16> | |
%7 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<256x128xbf16> | |
%8 = amdaie.logicalobjectfifo.from_memref %7, {%tile_8} : memref<256x128xbf16> -> !amdaie.logicalobjectfifo<memref<256x128xbf16>> | |
memref.assume_alignment %7, 64 : memref<256x128xbf16> | |
%9 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<128x128xf32> | |
%10 = amdaie.logicalobjectfifo.from_memref %9, {%tile_8} : memref<128x128xf32> -> !amdaie.logicalobjectfifo<memref<128x128xf32>> | |
memref.assume_alignment %9, 64 : memref<128x128xf32> | |
scf.forall (%arg0, %arg1) in (2, 2) { | |
%11 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg1) | |
%12 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_5 : memref<2x1x32x32xbf16, 1 : i32>) outs(%alloc_1 : memref<2x1x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%13 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, %12, 0] [2, 1, 32, 32] [8192, 32, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x256xbf16>>) | |
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%alloc_4 : memref<1x2x32x32xbf16, 1 : i32>) outs(%alloc : memref<1x2x32x32xbf16, 1 : i32>) { | |
^bb0(%in: bf16, %out: bf16): | |
linalg.yield %in : bf16 | |
} | |
%14 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1], %8[0, 0, 0, %11] [1, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xbf16, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xbf16>>) | |
%tile_9 = amdaie.tile(%c1, %c3) | |
%tile_10 = amdaie.tile(%c1, %c2) | |
%15 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_10, %tile_9} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%tile_11 = amdaie.tile(%c0, %c3) | |
%tile_12 = amdaie.tile(%c0, %c2) | |
%16 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_12, %tile_11} : memref<1x1x8x4x8x4xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xbf16, 2 : i32>> | |
%17 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_11, %tile_9} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xbf16, 2 : i32>> | |
%18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_12, %tile_10} : memref<1x1x4x8x4x8xbf16, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1 |