// Gist by @Abhishek-Varma, last active March 7, 2024 14:02
// -----// IR Dump Before TranslateTargetExecutableVariantsPass (iree-hal-translate-target-executable-variants) //----- //
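// NOTE: the "NOTE:" lines in this gist are editorial annotations, not part of the
// compiler output. This log traces the IREE AMD-AIE pipeline for one dispatch:
// a 2048x2048x512 i32 matmul lowered for the "amd-aie" / "amdaie-xclbin-fb"
// target (target_arch = "chip-tbd", no ukernels). The baseline below is a plain
// linalg.fill (zero-init) plus linalg.matmul over flow.dispatch.tensor bindings;
// each "IR Dump Before <Pass>" banner shows the IR exactly as that pass receives it.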
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before TypePropagation (iree-codegen-type-propagation) //----- //
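// NOTE: this dump and the next few (BubbleUpOrdinalOps, BufferizeCopyOnlyDispatches,
// DecomposeSoftmax) show an identical function body: all values here are already
// i32 so there is nothing for type propagation to legalize, there are no ordinal
// ops, the dispatch is not copy-only, and there is no softmax to decompose.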
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before BubbleUpOrdinalOps (iree-codegen-bubble-up-ordinal-ops) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
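// NOTE: still unchanged, presumably because no user-specified compilation info is
// attached to this dispatch for MaterializeUserConfigs to materialize.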
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIELoweringStrategy (iree-amdaie-lowering-strategy) //----- //
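// NOTE: AMDAIELoweringStrategy is where the tiling/packing strategy gets chosen;
// compare this dump with the next one to see the lowering_config and
// packing_config it attaches to the linalg.matmul.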
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIELowerExecutableTarget (iree-amdaie-lower-executable-target) //----- //
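// NOTE: the matmul below now carries tile_sizes [[64, 64], [0, 0, 0, 32, 32],
// [0, 0, 0, 0, 0, 4]] (presumably one entry per tiling level applied later in
// the pipeline) and a two-stage #amdaie.packing_config: an outer 64x64x512 pack
// (with innerPerm [1, 0] transposing operand index 1, i.e. B) and an inner
// 4x4x8 pack. The hal.executable.export also picks up translation_info<None>.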
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
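// NOTE: this first AMDAIETileAndFuse application tiles the matmul by the
// level-0 sizes (64x64) into an scf.forall over the 2048x2048 output, mapped to
// #gpu.block<y>/<x>, and fuses the zero-fill onto each tile (see the next dump).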
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
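// NOTE: tiling left the full-size fill (%6) and matmul (%7) behind as dead code
// next to the new scf.forall (%8); AMDAIECleanup erases them (compare the
// Before Canonicalizer dump that follows).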
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%9 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
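// NOTE: cleanup has removed the untiled ops, leaving only the 64x64-tiled
// scf.forall; the canonicalize and CSE rounds below keep this form unchanged.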
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
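// NOTE: AMDAIEPackAndTranspose applies the first packing level (see the next
// dump): the operands and init are packed via tensor.pack into 1x1x64x512,
// 1x1x512x64, and 1x1x64x64 tensors, the matmul is rewritten as a 6-D
// linalg.generic with iterator types (parallel, parallel, reduction) over both
// the outer and inner packed dims, and the result is restored with tensor.unpack.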
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
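// NOTE: AMDAIEBufferizeToAllocation gives each pack destination a concrete
// buffer: a memref.alloc in memory space 1 (presumably an intermediate AIE
// memory level such as memtile/L2), bridged back into tensors with
// bufferization.to_tensor restrict writable, plus matching memref.dealloc ops
// at the end of each tile (see the next dump).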
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %8 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%9 = tensor.empty() : tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x512x64xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %10 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%11 = tensor.empty() : tensor<1x1x64x64xi32>
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_3 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_4: i32, %out: i32):
%13 = arith.muli %in, %in_4 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x64x64xi32>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
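// NOTE: the second AMDAIETileAndFuse application tiles the packed
// linalg.generic by the level-1 sizes (32x32) into a nested scf.forall over the
// 64x64 tile, extracting 1x1x32x512, 1x1x512x32, and 1x1x32x32 slices (next dump).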
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %9 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x64x512xi32>
%11 = tensor.empty() : tensor<1x1x512x64xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %12 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%13 = tensor.empty() : tensor<1x1x64x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%pack_5 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_3 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_5 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%16 = arith.muli %in, %in_6 : i32
%17 = arith.addi %out, %16 : i32
linalg.yield %17 : i32
} -> tensor<1x1x64x64xi32>
%unpack = tensor.unpack %15 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
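// NOTE: as in the first tiling round, the untiled packed generic (%15) is left
// dead next to the nested forall (%16); cleanup erases it before the next
// canonicalize pass. The nested forall still reuses the #gpu.block mapping
// attribute at this stage.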
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %9 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x64x512xi32>
%11 = tensor.empty() : tensor<1x1x512x64xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %12 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%13 = tensor.empty() : tensor<1x1x64x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%pack_5 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_3 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_5 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%17 = arith.muli %in, %in_6 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x64x64xi32>
%16 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %pack_5) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_6 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_7 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_8 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_6, %extracted_slice_7 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_8 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%18 = arith.muli %in, %in_9 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %16 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
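// NOTE: by this point the zero-fill is applied directly to the packed
// 1x1x64x64 tensor (%11) instead of packing a pre-filled 64x64 tile; the
// canonicalizer below then drops the now-redundant 64x64 fill and retargets
// tensor.unpack straight at the output slice (compare the Before CSE dump).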
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %8 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %9 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%10 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%12 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %11) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%14 = arith.muli %in, %in_8 : i32
%15 = arith.addi %out, %14 : i32
linalg.yield %15 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %10) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEFuseFillIntoForall (iree-amdaie-fuse-fill-into-forall) //----- //
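// NOTE: AMDAIEFuseFillIntoForall receives the IR below; going by the pass name,
// it should sink the linalg.fill (%10) into the nested scf.forall so each
// 32x32 slice is zero-initialized where it is computed (the resulting dump is
// not shown here).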
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %10) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
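
The next dump shows the effect of iree-amdaie-fuse-fill-into-forall: the full-tile linalg.fill (%10 above) is replicated inside the inner scf.forall as a fill of each 1x1x32x32 accumulator slice, so every piece is zero-initialized where it is computed. The original fill and a duplicated extract_slice are left behind as dead code for the AMDAIECleanup pass that follows.
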
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%12 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%14 = arith.muli %in, %in_9 : i32
%15 = arith.addi %out, %14 : i32
linalg.yield %15 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
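
AMDAIECleanup erases that dead full-tile fill (%10) and the duplicate %extracted_slice_8, so from the next dump onward the inner forall carries only the fused per-slice fill.
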
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
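
This CSE input is identical to the canonicalizer input above it: the preceding cleanup already left the function in canonical form, and the next dump (the input to the second iree-amdaie-pack-and-transpose application) is unchanged as well.
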
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
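
The dump above is what the second iree-amdaie-pack-and-transpose application sees. The second packing_config entry (packedSizes = [0, 0, 0, 4, 4, 8] with innerPerm = [[0, 1], [1, 0], [0, 1]] and outerPerm = [0, 1, 3, 2] for all three operands) drives the micro-tiling that appears in the next dump as %pack_8 (A), %pack_9 (B), and %pack_10 (the accumulator).
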
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%pack_10 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_10 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%19 = arith.muli %in, %in_12 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
%unpack_11 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
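
This dump shows that second packing level in place: %pack_8 retiles the 1x1x32x512 A slice into 4x8 micro-tiles, %pack_9 the 1x1x512x32 B slice into 8x4 micro-tiles (operand 1's innerPerm = [1, 0] swaps its tile dims), and %pack_10 the 1x1x32x32 accumulator into 4x4 micro-tiles, each with outer_dims_perm = [0, 1, 3, 2]. A hypothetical NumPy model of the A-side pack, to make the layout concrete (function name and test data are ours):

import numpy as np

# tensor.pack with outer_dims_perm = [0, 1, 3, 2], inner_dims_pos = [2, 3],
# inner_tiles = [4, 8]: tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>.
def pack_a_tile(x):
    b0, b1, m, k = x.shape
    # Split the tiled dims: (b0, b1, m/4, 4, k/8, 8).
    t = x.reshape(b0, b1, m // 4, 4, k // 8, 8)
    # Move the tile dims innermost: outer (b0, b1, m/4, k/8), inner (4, 8).
    t = t.transpose(0, 1, 2, 4, 3, 5)
    # Apply outer_dims_perm [0, 1, 3, 2]: outer becomes (b0, b1, k/8, m/4).
    return t.transpose(0, 1, 3, 2, 4, 5)

a = np.arange(32 * 512, dtype=np.int32).reshape(1, 1, 32, 512)
assert pack_a_tile(a).shape == (1, 1, 64, 8, 4, 8)
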
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%pack_11 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_11 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%20 = arith.muli %in, %in_13 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
%unpack_12 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
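
iree-amdaie-bufferize-to-allocation has now given the accumulator micro-tile its own buffer: the tensor.empty destination of the accumulator pack is replaced by a bufferization.to_tensor of %alloc_10, a memref<1x1x8x8x4x4xi32, 2 : i32>. Note the two memory spaces in play: the 64x512, 512x64, and 64x64 tiles live in space 1 and this per-core accumulator in space 2, which in this pipeline appear to correspond to AIE memory-tile (L2) and core-local (L1) memory respectively.
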
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%pack_11 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_11 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%21 = arith.muli %in, %in_14 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<1x1x8x8x4x4xi32>
%c0_12 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%20 = scf.for %arg6 = %c0_12 to %c64 step %c4 iter_args(%arg7 = %pack_11) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_15 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%extracted_slice_16 = tensor.extract_slice %arg7[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_14, %extracted_slice_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%22 = arith.muli %in, %in_17 : i32
%23 = arith.addi %out, %22 : i32
linalg.yield %23 : i32
} -> tensor<1x1x8x8x4x4xi32>
%inserted_slice = tensor.insert_slice %21 into %arg7[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
%unpack_13 = tensor.unpack %20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
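
AMDAIETileAndFuse has materialized the last tile_sizes entry ([0, 0, 0, 0, 0, 4]) as an scf.for reduction loop from 0 to 64 step 4 over the outer K dimension of the packed operands; the full-size extract/insert slices and the in-loop constants it left behind are fodder for the cleanup and canonicalization that follow. A quick check of the loop bounds (plain Python, names ours):

# The packed A tile is 1x1x64x8x4x8: dim 2 (size 64) is K / inner_k.
K, inner_k, step = 512, 8, 4
k_outer = K // inner_k          # 64: upper bound of the scf.for
iters = k_outer // step         # 16 iterations
per_iter_k = step * inner_k     # each 1x1x4x8x4x8 slice covers 32 K elements
assert iters * per_iter_k == K
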
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %12 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%13 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %13 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%16 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %15) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%18 = arith.muli %in, %in_14 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
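
Comparing this with the next dump shows what the canonicalizer folds away here: the per-slice linalg.fill (%11) is only used as the destination of %unpack_11, which overwrites all of it, so the unpack is redirected to %extracted_slice_7 and the fill disappears. Zero-initialization of the accumulator survives solely as the linalg.fill on the space-2 buffer that seeds the scf.for's iter_args.
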
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%17 = arith.muli %in, %in_14 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %16 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEFusePackIntoForLoop (iree-amdaie-fuse-pack-into-for) //----- //
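// NOTE: In this dump the innermost packs (%pack_8, %pack_9) are still materialized in
// full ahead of the reduction loop, and the scf.for only extracts per-iteration slices
// of them. The pass named above presumably sinks those packs into the loop so that each
// iteration packs only the 32x32 slice it actually consumes.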
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%17 = arith.muli %in, %in_14 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %16 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
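// NOTE: The fuse-pack-into-for pass has rewritten the reduction loop: each iteration now
// packs its own 1x1x32x32 slice (%pack_14, %pack_18) directly into a slice of the packed
// destination. The original whole-tile packs (%pack_8, %pack_9), their extract_slices,
// and a duplicated affine.apply are now dead, left for AMDAIECleanup to drop.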
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_13 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_13 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_15 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_16 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_17 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_18 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_19 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_14, %pack_18 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%19 = arith.muli %in, %in_20 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
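// NOTE: AMDAIECleanup has removed the dead whole-tile packs and their unused slices; what
// remains to tidy up are the empty destination tensors %11 and %12 and the pair of
// duplicate affine.apply ops inside the scf.for.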
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%19 = arith.muli %in, %in_16 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
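// NOTE: Canonicalization appears to have left the structure unchanged; the two identical
// affine.apply affine_map<(d0) -> (d0 * 8)> ops (%16 and %17) are the obvious candidates
// for the CSE run below.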
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%19 = arith.muli %in, %in_16 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
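// NOTE: CSE has merged the duplicate affine.apply, so %16 now feeds both the LHS and RHS
// slice offsets in the reduction loop. The pass named above is expected to give the
// per-iteration pack destinations explicit local allocations instead of slices of
// tensor.empty.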
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIELowerToUKernels (iree-amdaie-lower-to-ukernels) //----- //
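// NOTE: AMDAIEBufferizeToAllocation has given each per-iteration pack an explicit
// destination buffer in local memory (%alloc_12, %alloc_16 : memref<..., 2 : i32>), with
// matching deallocs after the compute. The tensor.empty producers %11/%12 and the slices
// taken from them are now dead.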
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%alloc_12 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%17 = bufferization.to_tensor %alloc_12 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_14 = tensor.extract_slice %extracted_slice_6[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_15 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%alloc_16 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_16 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%20 = arith.muli %in, %in_18 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_16 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %19 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
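// NOTE: No ukernel calls were introduced, and the dead tensor.empty producers and their
// slices are gone. The only tensor.empty left is the 2048x2048 result initializer %5,
// which the next pass should tie to the output binding so the result is computed in
// place.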
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
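// NOTE: EliminateEmptyTensors has replaced %5 = tensor.empty() with a
// flow.dispatch.tensor.load of the output binding %2, so the outer scf.forall now updates
// the dispatch output in place and no tensor.empty remains for the next pass to convert.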
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
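// NOTE: With no tensor.empty left, EmptyTensorToAllocTensor appears to have been a no-op;
// this is the final tensor-level IR handed to comprehensive bufferization.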
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
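// Note (annotation): the IR is now fully bufferized. Bufferization also
// materialized conservative copy-back linalg.generic ops: identity copies from the
// computed subviews into their destinations, plus a whole-buffer copy of %2 onto
// itself at the end of the function. The canonicalize/CSE passes that follow are
// expected to eliminate these redundant copies.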
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
%3 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %alloc_8) -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %4, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%5 = arith.muli %in, %in_14 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %3 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
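// Note (annotation): resolve-shaped-type-result-dims made no visible change to
// this function. The canonicalizer below folds the scf.for iter_args (the memref
// accumulator is yielded unchanged each iteration) and drops the trivial
// whole-buffer copy of %2 onto itself.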
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
%3 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %alloc_8) -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %4, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%5 = arith.muli %in, %in_14 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %3 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
// -----// IR Dump Before CSE (cse) //----- //
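// Note (annotation): canonicalization removed the scf.for iter_args (the
// accumulator %alloc_8 is now used directly by the unpack) and erased the
// redundant whole-buffer copy of %2. The remaining subview-to-subview copies
// survive only because the duplicated memref.subview ops have not been merged
// yet; CSE below deduplicates them.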
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%4 = arith.muli %in, %in_14 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
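// Note (annotation): CSE merged the duplicate memref.subview ops, so the
// copy-back linalg.generic ops below now read and write the same view (e.g.
// ins(%subview_6) outs(%subview_6) and ins(%subview_1) outs(%subview_1)). The
// next canonicalization erases these self-copies.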
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_6 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
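// Note (annotation): the self-copies are gone, leaving the bare bufferized
// pipeline: pack into memory space 1, subtile, pack into memory space 2, the 9-D
// packed matmul, and unpack back out. CleanupBufferAllocView folds away unused or
// trivially aliasing allocation views, but nothing here qualifies; the next dump
// is unchanged.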
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before LowerUKernelOpsToCalls (iree-codegen-lower-ukernel-ops-to-calls) //----- //
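// Note (annotation): this variant was compiled with ukernels = "none" (see the
// target attributes), so no ukernel ops exist in the IR and this pass appears to
// be a no-op here; the next dump is identical.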
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
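// Note (annotation): fold-memref-alias-ops rewrites subview-of-subview chains so
// users index the underlying allocation directly. In the next dump the
// intermediate %subview_4 / %subview_5 views of %alloc and %alloc_2 no longer
// appear at the top of the inner scf.forall.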
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
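// NOTE (editorial annotation, not compiler output): the dump above shows the fully
// tiled and packed matmul, now on memrefs (post-bufferization). The 9-D
// linalg.generic is a multiply-accumulate (muli + addi body) over the packed
// 1x1x4x8x4x8 LHS and 1x1x8x4x8x4 RHS tiles into the 1x1x8x8x4x4 accumulator;
// its iterator_types repeat (parallel, parallel, reduction) once per tiling level.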
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
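// NOTE (editorial): this dump is the input to canonicalize. Comparing it with the
// next dump ("Before CSE") shows no visible change: the IR is already in canonical
// form at this stage, so the pass is effectively a no-op here.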
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %alloc_2[0, 0, %4, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%5 = arith.muli %in, %in_10 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before CSE (cse) //----- //
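// NOTE (editorial): CSE folds the duplicated index computation inside the K-loop.
// In this dump both
//   %3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
//   %4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
// compute the same value; the next dump keeps only %3 and rewrites %subview_8 to
// index %alloc_2 with it.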
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %alloc_2[0, 0, %4, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%5 = arith.muli %in, %in_10 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before AMDAIELowerWorkgroupCount (iree-amdaie-lower-workgroup-count) //----- //
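// NOTE (editorial): iree-amdaie-lower-workgroup-count presumably rewrites the
// export's workgroup-count region (the flow.dispatch.workgroup_count_from_slice
// above); its effect is not visible in this log, since from the next dump onward
// only the inner builtin.module is printed, without the hal.executable.variant
// wrapper.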
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
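// NOTE (editorial): this pass strips the #hal.descriptor_type<storage_buffer>
// memory-space annotation from memref types, e.g.
//   memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> -> memref<2048x512xi32>
// as the next dump ("Before FoldMemRefAliasOps") shows.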
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
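// NOTE (editorial): fold-memref-alias-ops folds subview/expand_shape aliases into
// their load/store-like users. No such users exist at this level yet (the subviews
// feed pack/unpack and linalg ops), so the next dump is unchanged.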
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before AMDAIEBridgeToAIR (iree-amdaie-bridge-to-air) //----- //
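// NOTE (editorial): iree-amdaie-bridge-to-air rewrites both scf.forall loops into
// scf.parallel with explicit constant bounds and scf.reduce terminators, dropping
// the #gpu.block mapping attributes:
//   scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {...} {mapping = [...]}
// becomes
//   scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {... scf.reduce}
// (compare with the next dump). The K-loop affine.apply also switches from a dim
// to a symbol operand: affine_map<()[s0] -> (s0 * 8)>()[%arg4].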
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before AMDAIEDecomposeLinalgExtPackUnPackToAIR (iree-amdaie-decompose-pack-unpack-to-air) //----- //
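// NOTE (editorial): this pass lowers each iree_linalg_ext.pack/unpack on memrefs
// into view ops plus a DMA. A whole-tile pack such as the one into %alloc becomes a
// single air.dma_memcpy_nd, while a retiling pack such as
//   iree_linalg_ext.pack %subview_6 ... inner_tiles = [4, 8] into %alloc_7
// becomes memref.expand_shape + memref.transpose describing the strided source
// view, followed by air.dma_memcpy_nd into the L1 buffer; unpacks are handled
// analogously with memref.transpose on the source (see the next dump).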
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg4]
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.reduce
}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before ParallelToHerd (air-par-to-herd) //----- //
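// NOTE (editorial): air-par-to-herd maps the inner 2x2 scf.parallel (64x64
// iteration space, step 32) onto an air.herd:
//   air.herd @herd_0 tile (%arg2, %arg3) in (%arg4=%c2, %arg5=%c2) ...
// The herd induction variables are normalized to {0, 1}, so the original tile
// offsets are recovered inside the body with affine.apply affine_map<(d0) -> (d0 * 32)>.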
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_2[] [] [], %subview_0[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
%subview_5 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg4]
%subview_8 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_8 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_10 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_9[] [] [], %transpose_10[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_11 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_13 = memref.expand_shape %subview_11 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_14 = memref.transpose %expand_shape_13 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %transpose_14[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_15: i32, %out: i32):
%4 = arith.muli %in, %in_15 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_7 = memref.transpose %alloc_6 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_5[] [] [], %transpose_7[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.reduce
}
%subview_4 = memref.subview %alloc_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_4 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_1[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before ParallelToLaunch (air-par-to-launch) //----- //
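// NOTE (editorial): air-par-to-launch performs the analogous rewrite one level up:
// the outer scf.parallel over (2048, 2048) with step 64 becomes
//   air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32)
// wrapping an air.segment, with the 64-element block offsets recovered via
// affine.apply affine_map<(d0) -> (d0 * 64)>. The launch -> segment -> herd nesting
// in the next dump is the complete AIR control hierarchy.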
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_2[] [] [], %subview_0[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%c2 = arith.constant 2 : index
%c0_4 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2_5 = arith.constant 2 : index
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c2_8 = arith.constant 2 : index
%c2_9 = arith.constant 2 : index
air.herd @herd_0 tile (%arg2, %arg3) in (%arg4=%c2_8, %arg5=%c2_9) args(%arg6=%alloc_3, %arg7=%alloc, %arg8=%alloc_2) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32_11 = arith.constant 0 : i32
%c0_12 = arith.constant 0 : index
%c64_13 = arith.constant 64 : index
%c4_14 = arith.constant 4 : index
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%subview_15 = memref.subview %arg6[0, 0, %3, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_16 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32_11 : i32) outs(%alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg9 = %c0_12 to %c64_13 step %c4_14 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
%subview_18 = memref.subview %arg7[0, 0, %3, %5] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_19 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_18 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_20 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_19[] [] [], %transpose_20[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_21 = memref.subview %arg8[0, 0, %5, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_22 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_23 = memref.expand_shape %subview_21 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_24 = memref.transpose %expand_shape_23 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_22[] [] [], %transpose_24[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_19, %alloc_22 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_25: i32, %out: i32):
%6 = arith.muli %in, %in_25 : i32
%7 = arith.addi %out, %6 : i32
linalg.yield %7 : i32
}
memref.dealloc %alloc_19 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_22 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_17 = memref.transpose %alloc_16 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_15[] [] [], %transpose_17[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_10 = memref.subview %alloc_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_10 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_1[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before CopyToDma (air-copy-to-dma) //----- //
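// NOTE (editorial): this is the input to air-copy-to-dma, which (per the pass name)
// rewrites remaining memref copies as air.dma_memcpy_nd. In this IR the data
// movement between L3, L2, and L1 is already expressed as air.dma_memcpy_nd, so the
// dump mainly shows the finished launch -> segment -> herd structure feeding those
// DMAs.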
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
%c32_0 = arith.constant 32 : index
%c0_1 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_2 = arith.constant 32 : index
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%c32_5 = arith.constant 32 : index
%c32_6 = arith.constant 32 : index
air.launch (%arg0, %arg1) in (%arg2=%c32_5, %arg3=%c32_6) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg2, %arg10=%arg3, %arg11=%arg4, %arg12=%arg5, %arg13=%arg6) : index, index, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg7)
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg8)
%subview = memref.subview %arg11[%3, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_7 = memref.subview %arg12[0, %4] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_8 = memref.subview %arg13[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_9 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_9[] [] [], %subview_7[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_10 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%c2 = arith.constant 2 : index
%c0_11 = arith.constant 0 : index
%c1_12 = arith.constant 1 : index
%c2_13 = arith.constant 2 : index
%c0_14 = arith.constant 0 : index
%c1_15 = arith.constant 1 : index
%c2_16 = arith.constant 2 : index
%c2_17 = arith.constant 2 : index
air.herd @herd_0 tile (%arg14, %arg15) in (%arg16=%c2_16, %arg17=%c2_17) args(%arg18=%alloc_10, %arg19=%alloc, %arg20=%alloc_9) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32_19 = arith.constant 0 : i32
%c0_20 = arith.constant 0 : index
%c64_21 = arith.constant 64 : index
%c4_22 = arith.constant 4 : index
%5 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg14)
%6 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg15)
%subview_23 = memref.subview %arg18[0, 0, %5, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_24 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32_19 : i32) outs(%alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg21 = %c0_20 to %c64_21 step %c4_22 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg21]
%subview_26 = memref.subview %arg19[0, 0, %5, %7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_27 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_26 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_28 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_27[] [] [], %transpose_28[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_29 = memref.subview %arg20[0, 0, %7, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_30 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_31 = memref.expand_shape %subview_29 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_32 = memref.transpose %expand_shape_31 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_30[] [] [], %transpose_32[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_27, %alloc_30 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_33: i32, %out: i32):
%8 = arith.muli %in, %in_33 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_27 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_30 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_25 = memref.transpose %alloc_24 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_23[] [] [], %transpose_25[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_18 = memref.subview %alloc_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_18 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_8[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_9 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_10 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
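// The view folding above leaves a lot of dead IR behind in the dump below: long runs
// of duplicated index constants (%c0_58 ... %c0_92, repeated %c32768, %c4096, %c1024)
// plus memref.subview / memref.expand_shape / memref.transpose ops whose only users
// were the rewritten DMAs. Canonicalization deletes the dead views and folds the
// duplicate constants; the CSE dump that follows shows the cleaned-up form, with each
// constant defined once per region.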
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg2, %arg10=%arg3, %arg11=%arg4, %arg12=%arg5, %arg13=%arg6) : index, index, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%subview = memref.subview %arg11[%3, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %arg12[0, %4] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %arg13[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%c0_2 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c512_3 = arith.constant 512 : index
air.dma_memcpy_nd (%alloc[] [] [], %arg11[%3, %c0_2] [%c64, %c512_3] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_4 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%c0_5 = arith.constant 0 : index
%c2048 = arith.constant 2048 : index
%c1_6 = arith.constant 1 : index
%c512_7 = arith.constant 512 : index
%c64_8 = arith.constant 64 : index
air.dma_memcpy_nd (%alloc_4[] [] [], %arg12[%c0_5, %4] [%c512_7, %c64_8] [%c2048, %c1_6]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_9 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg14, %arg15) in (%arg16=%c2, %arg17=%c2) args(%arg18=%alloc_9, %arg19=%alloc, %arg20=%alloc_4) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg14]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg15]
%subview_28 = memref.subview %arg18[0, 0, %5, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_29 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg21 = %c0_26 to %c64_27 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg21]
%subview_55 = memref.subview %arg19[0, 0, %5, %7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_56 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_55 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_57 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
%c0_58 = arith.constant 0 : index
%c0_59 = arith.constant 0 : index
%c32768 = arith.constant 32768 : index
%c32768_60 = arith.constant 32768 : index
%c8_61 = arith.constant 8 : index
%c2048_62 = arith.constant 2048 : index
%c512_63 = arith.constant 512 : index
%c1_64 = arith.constant 1 : index
%c1_65 = arith.constant 1 : index
%c1_66 = arith.constant 1 : index
%c4_67 = arith.constant 4 : index
%c8_68 = arith.constant 8 : index
%c4_69 = arith.constant 4 : index
%c8_70 = arith.constant 8 : index
%c0_71 = arith.constant 0 : index
%c0_72 = arith.constant 0 : index
air.dma_memcpy_nd (%alloc_56[] [] [], %arg19[%c0_72, %c0_71, %c0_58, %c0_59, %5, %7] [%c1_65, %c1_66, %c4_67, %c8_68, %c4_69, %c8_70] [%c32768, %c32768_60, %c8_61, %c2048_62, %c512_63, %c1_64]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%subview_73 = memref.subview %arg20[0, 0, %7, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_74 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_75 = memref.expand_shape %subview_73 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_76 = memref.transpose %expand_shape_75 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
%c0_77 = arith.constant 0 : index
%c0_78 = arith.constant 0 : index
%c32768_79 = arith.constant 32768 : index
%c32768_80 = arith.constant 32768 : index
%c4_81 = arith.constant 4 : index
%c512_82 = arith.constant 512 : index
%c64_83 = arith.constant 64 : index
%c1_84 = arith.constant 1 : index
%c1_85 = arith.constant 1 : index
%c1_86 = arith.constant 1 : index
%c8_87 = arith.constant 8 : index
%c4_88 = arith.constant 4 : index
%c8_89 = arith.constant 8 : index
%c4_90 = arith.constant 4 : index
%c0_91 = arith.constant 0 : index
%c0_92 = arith.constant 0 : index
air.dma_memcpy_nd (%alloc_74[] [] [], %arg20[%c0_92, %c0_91, %c0_77, %c0_78, %7, %6] [%c1_85, %c1_86, %c8_87, %c4_88, %c8_89, %c4_90] [%c32768_79, %c32768_80, %c4_81, %c512_82, %c64_83, %c1_84]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_56, %alloc_74 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_93: i32, %out: i32):
%8 = arith.muli %in, %in_93 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_56 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_74 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_30 = memref.transpose %alloc_29 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
%c0_31 = arith.constant 0 : index
%c0_32 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1024_33 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%c4_34 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c1_35 = arith.constant 1 : index
%c1_36 = arith.constant 1 : index
%c1_37 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c4_38 = arith.constant 4 : index
%c8_39 = arith.constant 8 : index
%c4_40 = arith.constant 4 : index
%c4096_41 = arith.constant 4096 : index
%c4096_42 = arith.constant 4096 : index
%c64_43 = arith.constant 64 : index
%c1_44 = arith.constant 1 : index
%c1_45 = arith.constant 1 : index
%c1_46 = arith.constant 1 : index
%c32_47 = arith.constant 32 : index
%c32_48 = arith.constant 32 : index
%c0_49 = arith.constant 0 : index
%c0_50 = arith.constant 0 : index
%c0_51 = arith.constant 0 : index
%c0_52 = arith.constant 0 : index
%c0_53 = arith.constant 0 : index
%c0_54 = arith.constant 0 : index
air.dma_memcpy_nd (%arg18[%c0_31, %c0_32, %5, %6] [%c1_45, %c1_46, %c32_47, %c32_48] [%c4096_41, %c4096_42, %c64_43, %c1_44], %alloc_29[%c0_54, %c0_53, %c0_52, %c0_51, %c0_50, %c0_49] [%c1_36, %c1_37, %c8, %c4_38, %c8_39, %c4_40] [%c1024, %c1024_33, %c16, %c4_34, %c128, %c1_35]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_10 = memref.subview %alloc_9[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_10 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
%c0_11 = arith.constant 0 : index
%c0_12 = arith.constant 0 : index
%c0_13 = arith.constant 0 : index
%c0_14 = arith.constant 0 : index
%c64_15 = arith.constant 64 : index
%c1_16 = arith.constant 1 : index
%c64_17 = arith.constant 64 : index
%c64_18 = arith.constant 64 : index
%c2048_19 = arith.constant 2048 : index
%c1_20 = arith.constant 1 : index
%c64_21 = arith.constant 64 : index
%c64_22 = arith.constant 64 : index
%c1_23 = arith.constant 1 : index
%c1_24 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
%c4096_25 = arith.constant 4096 : index
air.dma_memcpy_nd (%arg13[%3, %4] [%c64_21, %c64_22] [%c2048_19, %c1_20], %alloc_9[%c0_11, %c0_12, %c0_13, %c0_14] [%c1_24, %c1_23, %c64_17, %c64_18] [%c4096_25, %c4096, %c64_15, %c1_16]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_9 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
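// Canonicalization has already stripped the dead view ops and uniqued the constants,
// so CSE has little left to do on the dump below, and the structure is now easy to
// read: two L2-fill DMAs (A and B tiles) per segment, a 2x2 air.herd, a per-tile
// linalg.fill of the accumulator, an scf.for over K in 16 steps (step 4, with
// %7 = %arg19 * 8 selecting a 32-wide K slice of the 512-deep L2 buffers), and a
// final DMA of the 64x64 result tile back to the 2048x2048 output.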
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %arg9[%3, %c0_0] [%c64, %c512] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_1 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_1[] [] [], %arg10[%c0_0, %4] [%c512, %c64] [%c2048, %c1]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_2 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%alloc_2, %arg17=%alloc, %arg18=%alloc_1) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c32_3 = arith.constant 32 : index
%c4096_4 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_5 = arith.constant 1 : index
%c512_6 = arith.constant 512 : index
%c2048_7 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_8 = arith.constant 0 : index
%c64_9 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg19 = %c0_8 to %c64_9 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_11[] [] [], %arg17[%c0_8, %c0_8, %c0_8, %c0_8, %5, %7] [%c1_5, %c1_5, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_7, %c512_6, %c1_5]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %arg18[%c0_8, %c0_8, %c0_8, %c0_8, %7, %6] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_6, %c64_9, %c1_5]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%8 = arith.muli %in, %in_13 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
air.dma_memcpy_nd (%arg16[%c0_8, %c0_8, %5, %6] [%c1_5, %c1_5, %c32_3, %c32_3] [%c4096_4, %c4096_4, %c64_9, %c1_5], %alloc_10[%c0_8, %c0_8, %c0_8, %c0_8, %c0_8, %c0_8] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_5]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
air.dma_memcpy_nd (%arg11[%3, %4] [%c64, %c64] [%c2048, %c1], %alloc_2[%c0_0, %c0_0, %c0_0, %c0_0] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependency (air-dependency) //----- //
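// air-dependency rewrites this function into explicitly asynchronous form: each
// side-effecting op (hal.interface.binding.subspan, memref.assume_alignment,
// memref.alloc, linalg.fill, the matmul linalg.generic, every memref.dealloc) is
// wrapped in an air.execute region that yields an !air.async.token; air.launch,
// air.segment, air.herd and air.dma_memcpy_nd become `async` with explicit
// dependency lists; and the scf.for loop threads a token through iter_args so each
// K step waits on the previous one. The dump after this pass shows the tokenized IR.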
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %arg9[%3, %c0_0] [%c64, %c512] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_1 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_1[] [] [], %arg10[%c0_0, %4] [%c512, %c64] [%c2048, %c1]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_2 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%alloc_2, %arg17=%alloc, %arg18=%alloc_1) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c32_3 = arith.constant 32 : index
%c4096_4 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_5 = arith.constant 1 : index
%c512_6 = arith.constant 512 : index
%c2048_7 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_8 = arith.constant 0 : index
%c64_9 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg19 = %c0_8 to %c64_9 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_11[] [] [], %arg17[%c0_8, %c0_8, %c0_8, %c0_8, %5, %7] [%c1_5, %c1_5, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_7, %c512_6, %c1_5]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %arg18[%c0_8, %c0_8, %c0_8, %c0_8, %7, %6] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_6, %c64_9, %c1_5]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%8 = arith.muli %in, %in_13 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
air.dma_memcpy_nd (%arg16[%c0_8, %c0_8, %5, %6] [%c1_5, %c1_5, %c32_3, %c32_3] [%c4096_4, %c4096_4, %c64_9, %c1_5], %alloc_10[%c0_8, %c0_8, %c0_8, %c0_8, %c0_8, %c0_8] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_5]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
air.dma_memcpy_nd (%arg11[%3, %4] [%c64, %c64] [%c2048, %c1], %alloc_2[%c0_0, %c0_0, %c0_0, %c0_0] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependencyScheduleOpt (air-dependency-schedule-opt) //----- //
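// The dump below is air-dependency's output (note the air.execute wrappers, the
// air.wait_all joins, and the id attributes naming dependency-graph nodes).
// Comparing with the next dump, the visible effect of air-dependency-schedule-opt
// here is broadcast detection: the A-tile load (id = 3) indexes L2 only through
// %arg12, the herd row, and the B-tile load (id = 4) only through %arg13, the herd
// column, so the pass attaches a broadcast_pattern affine_set to each, marking the
// transfer as reusable across the other herd dimension.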
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%async_token_12, %async_token_14, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%c0_36 = arith.constant 0 : index
%c1_37 = arith.constant 1 : index
%c4_38 = arith.constant 4 : index
%c8_39 = arith.constant 8 : index
%c32768_40 = arith.constant 32768 : index
%c2048_41 = arith.constant 2048 : index
%c512_42 = arith.constant 512 : index
%c64_43 = arith.constant 64 : index
%async_token_44, %results_45 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_46, %results_47 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = air.dma_memcpy_nd async [%async_token_46, %async_token_44, %arg20] (%results_47[] [] [], %arg17[%c0_36, %c0_36, %c0_36, %c0_36, %results_29, %results_45] [%c1_37, %c1_37, %c4_38, %c8_39, %c4_38, %c8_39] [%c32768_40, %c32768_40, %c8_39, %c2048_41, %c512_42, %c1_37]) {id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%async_token_48, %results_49 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = air.dma_memcpy_nd async [%async_token_48, %async_token_44, %arg20] (%results_49[] [] [], %arg18[%c0_36, %c0_36, %c0_36, %c0_36, %results_45, %results_31] [%c1_37, %c1_37, %c8_39, %c4_38, %c8_39, %c4_38] [%c32768_40, %c32768_40, %c4_38, %c512_42, %c64_43, %c1_37]) {id = 4 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
%async_token_50 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_47, %results_49 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_53: i32, %out: i32):
%12 = arith.muli %in, %in_53 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_51 = air.execute [%async_token_50] {
memref.dealloc %results_47 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_52 = air.execute [%async_token_50] {
memref.dealloc %results_49 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_50] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 5 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%async_token_8, %async_token_10, %4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 6 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4, %2] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4, %3] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRSpecializeDmaBroadcast (air-specialize-dma-broadcast) //----- //
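// air-specialize-dma-broadcast consumes the broadcast_pattern annotations on DMA
// ids 3 and 4 below. In the next dump each annotated DMA is split into an
// affine.if / else over the tile coordinates (%arg12, %arg13): the A load takes one
// branch per herd row (broadcast_set s0 == 0 with source row offset %c0, else
// s0 - 1 == 0 with offset %c32) and the B load one branch per herd column, so each
// operand needs two physical L2-to-L1 transfers per K step instead of four, with
// each transfer broadcast to the pair of tiles that shares it.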
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%async_token_12, %async_token_14, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%async_token_36, %results_37 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_38, %results_39 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_26, %c0_26, %c0_26, %c0_26, %results_29, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_pattern = affine_set<(d0, d1)[s0] : (d0 - s0 == 0, d1 >= 0, -d1 + 1 >= 0, s0 >= 0, -s0 + 1 >= 0)>, id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%async_token_40, %results_41 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_26, %c0_26, %c0_26, %c0_26, %results_37, %results_31] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_pattern = affine_set<(d0, d1)[s0] : (d0 >= 0, -d0 + 1 >= 0, d1 - s0 == 0, s0 >= 0, -s0 + 1 >= 0)>, id = 4 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
%async_token_42 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_39, %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_45: i32, %out: i32):
%12 = arith.muli %in, %in_45 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_43 = air.execute [%async_token_42] {
memref.dealloc %results_39 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_44 = air.execute [%async_token_42] {
memref.dealloc %results_41 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_42] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 5 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%async_token_8, %async_token_10, %4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 6 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4, %2] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4, %3] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before DmaToChannel (air-dma-to-channel) //----- //
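// air-dma-to-channel replaces air.dma_memcpy_nd ops that cross a memory-hierarchy
// boundary with put/get pairs on declared air.channel endpoints, turning each memcpy
// into an explicit producer/consumer stream whose two sides can sit in different
// regions (launch vs. segment, segment vs. herd). A rough sketch of the expected
// shape, with hypothetical names, not taken from this log:
//   air.channel @channel_0 [1, 1]
//   ...
//   %put = air.channel.put async [%tok] @channel_0[] (%src[%off0, %off1] [%sz0, %sz1] [%st0, %st1]) : (memref<2048x512xi32>)
//   ...
//   %get = air.channel.get async [%tok2] @channel_0[] (%dst[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)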
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%async_token_12, %async_token_14, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%async_token_36, %results_37 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_38, %results_39 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_45 = arith.constant 0 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_49, %c0_48, %c0_47, %c0_46, %c0_45, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>, id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
affine.yield %12 : !air.async.token
} else {
%c32_45 = arith.constant 32 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_49, %c0_48, %c0_47, %c0_46, %c32_45, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 - 1 == 0, s1 >= 0, -s1 + 1 >= 0)>, id = 4 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
affine.yield %12 : !air.async.token
}
%async_token_40, %results_41 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_45 = arith.constant 0 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_49, %c0_48, %c0_47, %c0_46, %results_37, %c0_45] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>, id = 5 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
affine.yield %12 : !air.async.token
} else {
%c32_45 = arith.constant 32 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_49, %c0_48, %c0_47, %c0_46, %results_37, %c32_45] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 - 1 == 0)>, id = 6 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
affine.yield %12 : !air.async.token
}
%async_token_42 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_39, %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_45: i32, %out: i32):
%12 = arith.muli %in, %in_45 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_43 = air.execute [%async_token_42] {
memref.dealloc %results_39 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_44 = air.execute [%async_token_42] {
memref.dealloc %results_41 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_42] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 7 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%async_token_8, %async_token_10, %4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 8 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4, %2] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4, %3] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
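// NOTE (annotation, not part of the compiler dump): compared with the dump above,
// every explicit `air.dma_memcpy_nd` has been rewritten as an
// `air.channel.put`/`air.channel.get` pair over the `air.channel` symbols declared
// below, presumably by the AIR DMA-to-channel conversion that runs between these
// two dumps. The `broadcast_set` constraints carried by the old DMAs reappear as
// `broadcast_shape` attributes on @channel_0..@channel_3.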
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
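  // NOTE (annotation): reading the puts/gets below, the channels divide up as
  // follows: @channel_4 and @channel_5 move the 64x512 A tile and the 512x64 B
  // tile from the global memrefs into memory space 1; @channel_0/@channel_1
  // broadcast A sub-tiles and @channel_2/@channel_3 broadcast B sub-tiles into
  // the 2x2 herd (hence the [1, 2] and [2, 1] broadcast shapes); @channel_6
  // collects the four 32x32 accumulator tiles back into the 1x1x64x64 buffer;
  // and @channel_7 writes that buffer out to the 2048x2048 result.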
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%c2048 = arith.constant 2048 : index
%c64_10 = arith.constant 64 : index
%c1_11 = arith.constant 1 : index
%c512_12 = arith.constant 512 : index
%c0_13 = arith.constant 0 : index
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_14] @channel_5[] (%arg5[%c0_13, %results_15] [%c512_12, %c64_10] [%c2048, %c1_11]) : (memref<512x2048xi32>)
%c2048_16 = arith.constant 2048 : index
%c64_17 = arith.constant 64 : index
%c1_18 = arith.constant 1 : index
%async_token_19, %results_20 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_21, %results_22 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_19, %async_token_21] @channel_7[] (%arg6[%results_20, %results_22] [%c64_17, %c64_17] [%c2048_16, %c1_18]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048_23 = arith.constant 2048 : index
%c64_24 = arith.constant 64 : index
%c1_25 = arith.constant 1 : index
%c512_26 = arith.constant 512 : index
%c0_27 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_28, %results_29 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %23 : index
} {id = 7 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %23 : index
} {id = 8 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_28, %async_token_32] @channel_4[] (%results_33[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_34, %results_35 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_30, %async_token_34] @channel_5[] (%results_35[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_36, %results_37 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%c0_38 = arith.constant 0 : index
%c1_39 = arith.constant 1 : index
%c512_40 = arith.constant 512 : index
%c2048_41 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_42 = arith.constant 0 : index
%c64_43 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_44, %results_45 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_46, %results_47 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_48, %results_49 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = air.wait_all async
%8 = air.wait_all async [%async_token_32, %async_token_44, %async_token_46, %7] {id = 2 : i32}
%9 = scf.for %arg12 = %c0_42 to %c64_43 step %c4 iter_args(%arg13 = %8) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c4_100 = arith.constant 4 : index
%c8_101 = arith.constant 8 : index
%c32768_102 = arith.constant 32768 : index
%c2048_103 = arith.constant 2048 : index
%c512_104 = arith.constant 512 : index
%c64_105 = arith.constant 64 : index
%async_token_106, %results_107 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_106, %async_token_32, %arg13] @channel_0[] (%results_33[%c0_112, %c0_111, %c0_110, %c0_109, %c0_108, %results_107] [%c1_39, %c1_39, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_41, %c512_40, %c1_39]) : (memref<1x1x64x512xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_50 = arith.constant 1 : index
%c512_51 = arith.constant 512 : index
%c2048_52 = arith.constant 2048 : index
%c8_53 = arith.constant 8 : index
%c32768_54 = arith.constant 32768 : index
%c0_55 = arith.constant 0 : index
%c64_56 = arith.constant 64 : index
%c4_57 = arith.constant 4 : index
%async_token_58, %results_59 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_60, %results_61 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_62, %results_63 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%10 = air.wait_all async
%11 = air.wait_all async [%async_token_32, %async_token_58, %async_token_60, %10] {id = 2 : i32}
%12 = scf.for %arg12 = %c0_55 to %c64_56 step %c4_57 iter_args(%arg13 = %11) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c4_100 = arith.constant 4 : index
%c8_101 = arith.constant 8 : index
%c32768_102 = arith.constant 32768 : index
%c2048_103 = arith.constant 2048 : index
%c512_104 = arith.constant 512 : index
%c64_105 = arith.constant 64 : index
%async_token_106, %results_107 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c32_108 = arith.constant 32 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_106, %async_token_32, %arg13] @channel_1[] (%results_33[%c0_112, %c0_111, %c0_110, %c0_109, %c32_108, %results_107] [%c1_50, %c1_50, %c4_57, %c8_53, %c4_57, %c8_53] [%c32768_54, %c32768_54, %c8_53, %c2048_52, %c512_51, %c1_50]) : (memref<1x1x64x512xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c0_64 = arith.constant 0 : index
%c1_65 = arith.constant 1 : index
%c512_66 = arith.constant 512 : index
%c8_67 = arith.constant 8 : index
%c32768_68 = arith.constant 32768 : index
%c0_69 = arith.constant 0 : index
%c64_70 = arith.constant 64 : index
%c4_71 = arith.constant 4 : index
%async_token_72, %results_73 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_74, %results_75 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_76, %results_77 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%13 = air.wait_all async
%14 = air.wait_all async [%async_token_34, %async_token_72, %async_token_74, %13] {id = 2 : i32}
%15 = scf.for %arg12 = %c0_69 to %c64_70 step %c4_71 iter_args(%arg13 = %14) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c8_100 = arith.constant 8 : index
%c4_101 = arith.constant 4 : index
%c32768_102 = arith.constant 32768 : index
%c512_103 = arith.constant 512 : index
%c64_104 = arith.constant 64 : index
%async_token_105, %results_106 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c0_107 = arith.constant 0 : index
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_105, %async_token_34, %arg13] @channel_2[] (%results_35[%c0_111, %c0_110, %c0_109, %c0_108, %results_106, %c0_107] [%c1_65, %c1_65, %c8_67, %c4_71, %c8_67, %c4_71] [%c32768_68, %c32768_68, %c4_71, %c512_66, %c64_70, %c1_65]) : (memref<1x1x512x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_78 = arith.constant 1 : index
%c512_79 = arith.constant 512 : index
%c8_80 = arith.constant 8 : index
%c32768_81 = arith.constant 32768 : index
%c0_82 = arith.constant 0 : index
%c64_83 = arith.constant 64 : index
%c4_84 = arith.constant 4 : index
%async_token_85, %results_86 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_87, %results_88 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_89, %results_90 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%16 = air.wait_all async
%17 = air.wait_all async [%async_token_34, %async_token_85, %async_token_87, %16] {id = 2 : i32}
%18 = scf.for %arg12 = %c0_82 to %c64_83 step %c4_84 iter_args(%arg13 = %17) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c8_100 = arith.constant 8 : index
%c4_101 = arith.constant 4 : index
%c32768_102 = arith.constant 32768 : index
%c512_103 = arith.constant 512 : index
%c64_104 = arith.constant 64 : index
%async_token_105, %results_106 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c32_107 = arith.constant 32 : index
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_105, %async_token_34, %arg13] @channel_3[] (%results_35[%c0_111, %c0_110, %c0_109, %c0_108, %results_106, %c32_107] [%c1_78, %c1_78, %c8_80, %c4_84, %c8_80, %c4_84] [%c32768_81, %c32768_81, %c4_84, %c512_79, %c64_83, %c1_78]) : (memref<1x1x512x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_91 = arith.constant 1 : index
%c0_92 = arith.constant 0 : index
%c0_93 = arith.constant 0 : index
%c2_94 = arith.constant 2 : index
%c2_95 = arith.constant 2 : index
%19 = air.wait_all async [%async_token_36]
%20 = scf.parallel (%arg12, %arg13) = (%c0_92, %c0_93) to (%c2_94, %c2_95) step (%c1_91, %c1_91) init (%19) -> !air.async.token {
%c32_99 = arith.constant 32 : index
%c4096_100 = arith.constant 4096 : index
%c1_101 = arith.constant 1 : index
%c0_102 = arith.constant 0 : index
%c64_103 = arith.constant 64 : index
%async_token_104, %results_105 = air.execute -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %25 : index
} {id = 12 : i32}
%async_token_106, %results_107 = air.execute -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %25 : index
} {id = 13 : i32}
%23 = air.channel.get async [%async_token_106, %async_token_104, %async_token_36, %19] @channel_6[%arg12, %arg13] (%results_37[%c0_102, %c0_102, %results_105, %results_107] [%c1_101, %c1_101, %c32_99, %c32_99] [%c4096_100, %c4096_100, %c64_103, %c1_101]) : (memref<1x1x64x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.reduce(%24 : !air.async.token) {
^bb0(%arg14: !air.async.token, %arg15: !air.async.token):
%25 = air.wait_all async [%arg14, %arg15]
scf.reduce.return %25 : !air.async.token
}
}
%21 = air.herd @herd_0 async [%async_token_32, %async_token_34, %async_token_36] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_37, %arg17=%results_33, %arg18=%results_35) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_99 = arith.constant 32 : index
%c4096_100 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_101 = arith.constant 1 : index
%c512_102 = arith.constant 512 : index
%c2048_103 = arith.constant 2048 : index
%c8_104 = arith.constant 8 : index
%c32768_105 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_106 = arith.constant 0 : index
%c64_107 = arith.constant 64 : index
%c4_108 = arith.constant 4 : index
%async_token_109, %results_110 = air.execute -> (index) {
%26 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %26 : index
} {id = 12 : i32}
%async_token_111, %results_112 = air.execute -> (index) {
%26 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %26 : index
} {id = 13 : i32}
%async_token_113, %results_114 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_115 = air.execute [%async_token_113] {
linalg.fill ins(%c0_i32 : i32) outs(%results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%23 = air.wait_all async [%async_token_109, %async_token_111, %async_token_115] {id = 2 : i32}
%24 = scf.for %arg19 = %c0_106 to %c64_107 step %c4_108 iter_args(%arg20 = %23) -> (!air.async.token) {
%async_token_117, %results_118 = air.execute [%arg20] -> (index) {
%29 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %29 : index
} {id = 16 : i32}
%async_token_119, %results_120 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%26 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_119, %async_token_117, %arg20] @channel_0[%arg12, %arg13] (%results_120[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %29 : !air.async.token
} else {
%c32_126 = arith.constant 32 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_119, %async_token_117, %arg20] @channel_1[%arg12, %arg13] (%results_120[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %29 : !air.async.token
}
%async_token_121, %results_122 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%27 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_121, %async_token_117, %arg20] @channel_2[%arg12, %arg13] (%results_122[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %29 : !air.async.token
} else {
%c32_126 = arith.constant 32 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_121, %async_token_117, %arg20] @channel_3[%arg12, %arg13] (%results_122[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %29 : !air.async.token
}
%async_token_123 = air.execute [%27, %26, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_120, %results_122 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_126: i32, %out: i32):
%29 = arith.muli %in, %in_126 : i32
%30 = arith.addi %out, %29 : i32
linalg.yield %30 : i32
}
} {id = 19 : i32}
%async_token_124 = air.execute [%async_token_123] {
memref.dealloc %results_120 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_125 = air.execute [%async_token_123] {
memref.dealloc %results_122 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%28 = air.wait_all async [%arg20, %async_token_123] {id = 1 : i32}
scf.yield %28 : !air.async.token
}
%25 = air.channel.put async [%24] @channel_6[%arg12, %arg13] (%results_114[%c0_106, %c0_106, %c0_106, %c0_106, %c0_106, %c0_106] [%c1_101, %c1_101, %c8_104, %c4_108, %c8_104, %c4_108] [%c1024, %c1024, %c16, %c4_108, %c128, %c1_101]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_116 = air.execute [%25] {
memref.dealloc %results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%22 = air.channel.put async [%async_token_28, %async_token_30, %21] @channel_7[] (%results_37[%c0_27, %c0_27, %c0_27, %c0_27] [%c1_25, %c1_25, %c64_24, %c64_24] [%c4096, %c4096, %c64_24, %c1_25]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_96 = air.execute [%21, %5] {
memref.dealloc %results_33 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_97 = air.execute [%21, %6] {
memref.dealloc %results_35 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_98 = air.execute [%22] {
memref.dealloc %results_37 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
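// NOTE (annotation, not part of the compiler dump): the canonicalizer has folded
// the duplicated `arith.constant`s, deleted the dead constants inside the
// `affine.if` regions, collapsed the redundant `air.wait_all` chains, and dropped
// block arguments that are no longer used: `air.segment` now forwards only the two
// induction indices and `air.herd` takes no args at all, since all data reaches
// them through the channels.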
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_22, %results_23 = air.execute -> (index) {
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %14 : index
} {id = 7 : i32}
%async_token_24, %results_25 = air.execute -> (index) {
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %14 : index
} {id = 8 : i32}
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_22, %async_token_26] @channel_4[] (%results_27[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_28, %results_29 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_24, %async_token_28] @channel_5[] (%results_29[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_30, %results_31 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_26) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_0[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_34, %results_35 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%8 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_26) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_1[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%9 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_28) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_2[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_38, %results_39 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_28) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_3[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%11 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_30) -> !air.async.token {
%async_token_43, %results_44 = air.execute -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %15 : index
} {id = 12 : i32}
%async_token_45, %results_46 = air.execute -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %15 : index
} {id = 13 : i32}
%14 = air.channel.get async [%async_token_45, %async_token_43, %async_token_30, %async_token_30] @channel_6[%arg9, %arg10] (%results_31[%c0_21, %c0_21, %results_44, %results_46] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%14 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%15 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %15 : !air.async.token
}
}
%12 = air.herd @herd_0 async [%async_token_26, %async_token_28, %async_token_30] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 1 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_43 = arith.constant 1 : index
%c8_44 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_45 = arith.constant 0 : index
%c64_46 = arith.constant 64 : index
%c4_47 = arith.constant 4 : index
%async_token_48, %results_49 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %17 : index
} {id = 12 : i32}
%async_token_50, %results_51 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %17 : index
} {id = 13 : i32}
%async_token_52, %results_53 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_54 = air.execute [%async_token_52] {
linalg.fill ins(%c0_i32 : i32) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%14 = air.wait_all async [%async_token_48, %async_token_50, %async_token_54] {id = 2 : i32}
%15 = scf.for %arg13 = %c0_45 to %c64_46 step %c4_47 iter_args(%arg14 = %14) -> (!air.async.token) {
%async_token_56, %results_57 = air.execute [%arg14] -> (index) {
%20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg13]
air.execute_terminator %20 : index
} {id = 16 : i32}
%async_token_58, %results_59 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%17 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_0[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_1[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_60, %results_61 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%18 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_2[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_3[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_62 = air.execute [%18, %17, %arg14] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_59, %results_61 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%20 = arith.muli %in, %in_65 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
} {id = 19 : i32}
%async_token_63 = air.execute [%async_token_62] {
memref.dealloc %results_59 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_64 = air.execute [%async_token_62] {
memref.dealloc %results_61 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%19 = air.wait_all async [%arg14, %async_token_62] {id = 1 : i32}
scf.yield %19 : !air.async.token
}
%16 = air.channel.put async [%15] @channel_6[%arg9, %arg10] (%results_53[%c0_45, %c0_45, %c0_45, %c0_45, %c0_45, %c0_45] [%c1_43, %c1_43, %c8_44, %c4_47, %c8_44, %c4_47] [%c1024, %c1024, %c16, %c4_47, %c128, %c1_43]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_55 = air.execute [%16] {
memref.dealloc %results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%13 = air.channel.put async [%async_token_22, %async_token_24, %12] @channel_7[] (%results_31[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_40 = air.execute [%12, %5] {
memref.dealloc %results_27 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_41 = air.execute [%12, %6] {
memref.dealloc %results_29 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_42 = air.execute [%13] {
memref.dealloc %results_31 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependencyCanonicalize (air-dependency-canonicalize) //----- //
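// NOTE (annotation, not part of the compiler dump): CSE finds nothing left to
// eliminate here; the dump below appears verbatim-identical to the previous one,
// so this is the unchanged input handed to air-dependency-canonicalize.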
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_22, %results_23 = air.execute -> (index) {
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %14 : index
} {id = 7 : i32}
%async_token_24, %results_25 = air.execute -> (index) {
%14 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %14 : index
} {id = 8 : i32}
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_22, %async_token_26] @channel_4[] (%results_27[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_28, %results_29 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_24, %async_token_28] @channel_5[] (%results_29[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_30, %results_31 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_26) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_0[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_34, %results_35 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%8 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_26) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_1[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%9 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_28) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_2[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%async_token_38, %results_39 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_28) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %15 : index
} {id = 16 : i32}
%14 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_3[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %14 : !air.async.token
}
%11 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_30) -> !air.async.token {
%async_token_43, %results_44 = air.execute -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %15 : index
} {id = 12 : i32}
%async_token_45, %results_46 = air.execute -> (index) {
%15 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %15 : index
} {id = 13 : i32}
%14 = air.channel.get async [%async_token_45, %async_token_43, %async_token_30, %async_token_30] @channel_6[%arg9, %arg10] (%results_31[%c0_21, %c0_21, %results_44, %results_46] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%14 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%15 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %15 : !air.async.token
}
}
%12 = air.herd @herd_0 async [%async_token_26, %async_token_28, %async_token_30] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 1 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_43 = arith.constant 1 : index
%c8_44 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_45 = arith.constant 0 : index
%c64_46 = arith.constant 64 : index
%c4_47 = arith.constant 4 : index
%async_token_48, %results_49 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %17 : index
} {id = 12 : i32}
%async_token_50, %results_51 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %17 : index
} {id = 13 : i32}
%async_token_52, %results_53 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_54 = air.execute [%async_token_52] {
linalg.fill ins(%c0_i32 : i32) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%14 = air.wait_all async [%async_token_48, %async_token_50, %async_token_54] {id = 2 : i32}
%15 = scf.for %arg13 = %c0_45 to %c64_46 step %c4_47 iter_args(%arg14 = %14) -> (!air.async.token) {
%async_token_56, %results_57 = air.execute [%arg14] -> (index) {
%20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg13]
air.execute_terminator %20 : index
} {id = 16 : i32}
%async_token_58, %results_59 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%17 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_0[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_1[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_60, %results_61 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%18 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_2[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_3[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_62 = air.execute [%18, %17, %arg14] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_59, %results_61 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%20 = arith.muli %in, %in_65 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
} {id = 19 : i32}
%async_token_63 = air.execute [%async_token_62] {
memref.dealloc %results_59 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_64 = air.execute [%async_token_62] {
memref.dealloc %results_61 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%19 = air.wait_all async [%arg14, %async_token_62] {id = 1 : i32}
scf.yield %19 : !air.async.token
}
%16 = air.channel.put async [%15] @channel_6[%arg9, %arg10] (%results_53[%c0_45, %c0_45, %c0_45, %c0_45, %c0_45, %c0_45] [%c1_43, %c1_43, %c8_44, %c4_47, %c8_44, %c4_47] [%c1024, %c1024, %c16, %c4_47, %c128, %c1_43]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_55 = air.execute [%16] {
memref.dealloc %results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%13 = air.channel.put async [%async_token_22, %async_token_24, %12] @channel_7[] (%results_31[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_40 = air.execute [%12, %5] {
memref.dealloc %results_27 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_41 = air.execute [%12, %6] {
memref.dealloc %results_29 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_42 = air.execute [%13] {
memref.dealloc %results_31 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
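// NOTE (annotation, not part of the compiler dump): air-dependency-canonicalize
// has pruned the async dependency graph. Visible differences from the dump above:
// the `{id = ...}` bookkeeping attributes move from the `air.execute` ops onto the
// channel ops, dependency lists shrink to the tokens that are actually needed
// (the `air.channel.put` in the reduction loop now waits only on its index
// computation), and dead index computations in the segment are replaced by bare
// `air.wait_all async` placeholder tokens.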
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 1 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) {id = 3 : i32} : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%5 = air.wait_all async
%6 = air.wait_all async
%async_token_22, %results_23 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
}
%7 = air.channel.get async [%5, %async_token_22] @channel_4[] (%results_23[] [] []) {id = 4 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_24, %results_25 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
}
%8 = air.channel.get async [%6, %async_token_24] @channel_5[] (%results_25[] [] []) {id = 5 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
}
%9 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg10] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_0[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 6 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg10] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_1[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 7 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%11 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg10] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_2[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 8 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%12 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg10] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_3[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 9 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%13 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_26) -> !air.async.token {
%async_token_31, %results_32 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %17 : index
}
%async_token_33, %results_34 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %17 : index
}
%16 = air.channel.get async [%async_token_26, %async_token_33, %async_token_31] @channel_6[%arg9, %arg10] (%results_27[%c0_21, %c0_21, %results_32, %results_34] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) {id = 10 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%16 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%17 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %17 : !air.async.token
}
}
%14 = air.herd @herd_0 async [%async_token_22, %async_token_24, %async_token_26] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 3 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_31 = arith.constant 1 : index
%c8_32 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_33 = arith.constant 0 : index
%c64_34 = arith.constant 64 : index
%c4_35 = arith.constant 4 : index
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
}
%async_token_38 = air.execute [%async_token_36] {
linalg.fill ins(%c0_i32 : i32) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>)
}
%16 = scf.for %arg13 = %c0_33 to %c64_34 step %c4_35 iter_args(%arg14 = %async_token_38) -> (!air.async.token) {
%async_token_40, %results_41 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%18 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_40, %arg14] @channel_0[%arg9, %arg10] (%results_41[] [] []) {id = 11 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_40, %arg14] @channel_1[%arg9, %arg10] (%results_41[] [] []) {id = 12 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_42, %results_43 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%19 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%20 = air.channel.get async [%async_token_42, %arg14] @channel_2[%arg9, %arg10] (%results_43[] [] []) {id = 13 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_42, %arg14] @channel_3[%arg9, %arg10] (%results_43[] [] []) {id = 14 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_44 = air.execute [%19, %18] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_41, %results_43 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_47: i32, %out: i32):
%20 = arith.muli %in, %in_47 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
}
%async_token_45 = air.execute [%async_token_44] {
memref.dealloc %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%async_token_46 = air.execute [%async_token_44] {
memref.dealloc %results_43 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
scf.yield %async_token_44 : !air.async.token
}
%17 = air.channel.put async [%16] @channel_6[%arg9, %arg10] (%results_37[%c0_33, %c0_33, %c0_33, %c0_33, %c0_33, %c0_33] [%c1_31, %c1_31, %c8_32, %c4_35, %c8_32, %c4_35] [%c1024, %c1024, %c16, %c4_35, %c128, %c1_31]) {id = 15 : i32} : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_39 = air.execute [%17] {
memref.dealloc %results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
air.herd_terminator
}
%15 = air.channel.put async [%5, %6, %14] @channel_7[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) {id = 16 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_28 = air.execute [%7] {
memref.dealloc %results_23 : memref<1x1x64x512xi32, 1 : i32>
}
%async_token_29 = air.execute [%8] {
memref.dealloc %results_25 : memref<1x1x512x64xi32, 1 : i32>
}
%async_token_30 = air.execute [%15] {
memref.dealloc %results_27 : memref<1x1x64x64xi32, 1 : i32>
}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
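// NOTE (editorial): comparing this dump with the next one ("Before CSE") shows the
// canonicalizer's one visible effect here: air.segment @segment_0 loses its dead
// args(%arg7=%arg0, %arg8=%arg1) operands, which the segment body never uses, and the
// remaining block arguments are renumbered accordingly.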
// -----// IR Dump Before CSE (cse) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 1 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) {id = 3 : i32} : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%5 = air.wait_all async
%6 = air.wait_all async
%async_token_22, %results_23 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
}
%7 = air.channel.get async [%5, %async_token_22] @channel_4[] (%results_23[] [] []) {id = 4 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_24, %results_25 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
}
%8 = air.channel.get async [%6, %async_token_24] @channel_5[] (%results_25[] [] []) {id = 5 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
}
%9 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_0[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 6 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%10 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_1[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 7 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%11 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_2[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 8 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%12 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_3[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 9 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%13 = scf.parallel (%arg7, %arg8) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_26) -> !air.async.token {
%async_token_31, %results_32 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg7]
air.execute_terminator %17 : index
}
%async_token_33, %results_34 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg8]
air.execute_terminator %17 : index
}
%16 = air.channel.get async [%async_token_26, %async_token_33, %async_token_31] @channel_6[%arg7, %arg8] (%results_27[%c0_21, %c0_21, %results_32, %results_34] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) {id = 10 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%16 : !air.async.token) {
^bb0(%arg9: !air.async.token, %arg10: !air.async.token):
%17 = air.wait_all async [%arg9, %arg10]
scf.reduce.return %17 : !air.async.token
}
}
%14 = air.herd @herd_0 async [%async_token_22, %async_token_24, %async_token_26] tile (%arg7, %arg8) in (%arg9=%c2, %arg10=%c2) attributes {id = 3 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_31 = arith.constant 1 : index
%c8_32 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_33 = arith.constant 0 : index
%c64_34 = arith.constant 64 : index
%c4_35 = arith.constant 4 : index
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
}
%async_token_38 = air.execute [%async_token_36] {
linalg.fill ins(%c0_i32 : i32) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>)
}
%16 = scf.for %arg11 = %c0_33 to %c64_34 step %c4_35 iter_args(%arg12 = %async_token_38) -> (!air.async.token) {
%async_token_40, %results_41 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%18 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg7, %arg8] -> !air.async.token {
%20 = air.channel.get async [%async_token_40, %arg12] @channel_0[%arg7, %arg8] (%results_41[] [] []) {id = 11 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_40, %arg12] @channel_1[%arg7, %arg8] (%results_41[] [] []) {id = 12 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_42, %results_43 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%19 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg7, %arg8] -> !air.async.token {
%20 = air.channel.get async [%async_token_42, %arg12] @channel_2[%arg7, %arg8] (%results_43[] [] []) {id = 13 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_42, %arg12] @channel_3[%arg7, %arg8] (%results_43[] [] []) {id = 14 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_44 = air.execute [%19, %18] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_41, %results_43 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_47: i32, %out: i32):
%20 = arith.muli %in, %in_47 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
}
%async_token_45 = air.execute [%async_token_44] {
memref.dealloc %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%async_token_46 = air.execute [%async_token_44] {
memref.dealloc %results_43 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
scf.yield %async_token_44 : !air.async.token
}
%17 = air.channel.put async [%16] @channel_6[%arg7, %arg8] (%results_37[%c0_33, %c0_33, %c0_33, %c0_33, %c0_33, %c0_33] [%c1_31, %c1_31, %c8_32, %c4_35, %c8_32, %c4_35] [%c1024, %c1024, %c16, %c4_35, %c128, %c1_31]) {id = 15 : i32} : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_39 = air.execute [%17] {
memref.dealloc %results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
air.herd_terminator
}
%15 = air.channel.put async [%5, %6, %14] @channel_7[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) {id = 16 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_28 = air.execute [%7] {
memref.dealloc %results_23 : memref<1x1x64x512xi32, 1 : i32>
}
%async_token_29 = air.execute [%8] {
memref.dealloc %results_25 : memref<1x1x512x64xi32, 1 : i32>
}
%async_token_30 = air.execute [%15] {
memref.dealloc %results_27 : memref<1x1x64x64xi32, 1 : i32>
}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
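// NOTE (editorial): CSE leaves this function essentially unchanged. The repeated
// affine.apply of affine_map<()[s0] -> (s0 * 64)> on %arg0 (and on %arg1) cannot be
// deduplicated because each one lives in its own air.execute region producing a distinct
// async token; the next dump differs mainly in printing scope, showing the func.func
// alone without the enclosing module and air.channel declarations.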
// -----// IR Dump Before AIRSegmentLoopFusion (air-loop-fusion) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 1 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) {id = 3 : i32} : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%5 = air.wait_all async
%6 = air.wait_all async
%async_token_22, %results_23 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
}
%7 = air.channel.get async [%5, %async_token_22] @channel_4[] (%results_23[] [] []) {id = 4 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_24, %results_25 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
}
%8 = air.channel.get async [%6, %async_token_24] @channel_5[] (%results_25[] [] []) {id = 5 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
}
%9 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_0[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 6 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%10 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_22) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_1[] (%results_23[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_32] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) {id = 7 : i32} : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%11 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_2[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 8 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%12 = scf.for %arg7 = %c0_21 to %c64_18 step %c4 iter_args(%arg8 = %async_token_24) -> (!air.async.token) {
%async_token_31, %results_32 = air.execute [%arg8] -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg7]
air.execute_terminator %17 : index
}
%16 = air.channel.put async [%async_token_31] @channel_3[] (%results_25[%c0_21, %c0_21, %c0_21, %c0_21, %results_32, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) {id = 9 : i32} : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %16 : !air.async.token
}
%13 = scf.parallel (%arg7, %arg8) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_26) -> !air.async.token {
%async_token_31, %results_32 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg7]
air.execute_terminator %17 : index
}
%async_token_33, %results_34 = air.execute -> (index) {
%17 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg8]
air.execute_terminator %17 : index
}
%16 = air.channel.get async [%async_token_26, %async_token_33, %async_token_31] @channel_6[%arg7, %arg8] (%results_27[%c0_21, %c0_21, %results_32, %results_34] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) {id = 10 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%16 : !air.async.token) {
^bb0(%arg9: !air.async.token, %arg10: !air.async.token):
%17 = air.wait_all async [%arg9, %arg10]
scf.reduce.return %17 : !air.async.token
}
}
%14 = air.herd @herd_0 async [%async_token_22, %async_token_24, %async_token_26] tile (%arg7, %arg8) in (%arg9=%c2, %arg10=%c2) attributes {id = 3 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_31 = arith.constant 1 : index
%c8_32 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_33 = arith.constant 0 : index
%c64_34 = arith.constant 64 : index
%c4_35 = arith.constant 4 : index
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
}
%async_token_38 = air.execute [%async_token_36] {
linalg.fill ins(%c0_i32 : i32) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>)
}
%16 = scf.for %arg11 = %c0_33 to %c64_34 step %c4_35 iter_args(%arg12 = %async_token_38) -> (!air.async.token) {
%async_token_40, %results_41 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%18 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg7, %arg8] -> !air.async.token {
%20 = air.channel.get async [%async_token_40, %arg12] @channel_0[%arg7, %arg8] (%results_41[] [] []) {id = 11 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_40, %arg12] @channel_1[%arg7, %arg8] (%results_41[] [] []) {id = 12 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_42, %results_43 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%19 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg7, %arg8] -> !air.async.token {
%20 = air.channel.get async [%async_token_42, %arg12] @channel_2[%arg7, %arg8] (%results_43[] [] []) {id = 13 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
} else {
%20 = air.channel.get async [%async_token_42, %arg12] @channel_3[%arg7, %arg8] (%results_43[] [] []) {id = 14 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %20 : !air.async.token
}
%async_token_44 = air.execute [%19, %18] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_41, %results_43 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_47: i32, %out: i32):
%20 = arith.muli %in, %in_47 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
}
}
%async_token_45 = air.execute [%async_token_44] {
memref.dealloc %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>
}
%async_token_46 = air.execute [%async_token_44] {
memref.dealloc %results_43 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
scf.yield %async_token_44 : !air.async.token
}
%17 = air.channel.put async [%16] @channel_6[%arg7, %arg8] (%results_37[%c0_33, %c0_33, %c0_33, %c0_33, %c0_33, %c0_33] [%c1_31, %c1_31, %c8_32, %c4_35, %c8_32, %c4_35] [%c1024, %c1024, %c16, %c4_35, %c128, %c1_31]) {id = 15 : i32} : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_39 = air.execute [%17] {
memref.dealloc %results_37 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
air.herd_terminator
}
%15 = air.channel.put async [%5, %6, %14] @channel_7[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) {id = 16 : i32} : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_28 = air.execute [%7] {
memref.dealloc %results_23 : memref<1x1x64x512xi32, 1 : i32>
}
%async_token_29 = air.execute [%8] {
memref.dealloc %results_25 : memref<1x1x512x64xi32, 1 : i32>
}
%async_token_30 = air.execute [%15] {
memref.dealloc %results_27 : memref<1x1x64x64xi32, 1 : i32>
}
air.segment_terminator
}
air.launch_terminator
}
return
}
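// NOTE (editorial): compilation aborts past this point. The diagnostics below were
// emitted while running air-loop-fusion; the generic-form dump that follows is the
// diagnostic's snapshot of the invalid IR, not another pass boundary.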
<stdin>:20:16: error: operand #0 does not dominate this use
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
^
<stdin>:20:16: note: see current operation: %13 = "air.channel.get"(%22#0, %11, %22#1) <{chan_name = @channel_4, operandSegmentSizes = array<i32: 2, 0, 1, 0, 0, 0>}> {id = 4 : i32} : (!air.async.token, !air.async.token, memref<1x1x64x512xi32, 1 : i32>) -> !air.async.token
<stdin>:20:16: note: operand defined here (op in a child region)
<stdin>:3:5: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
^
<stdin>:3:5: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg0: !hal.device):
%0 = "arith.constant"() <{value = 1 : index}> : () -> index
%1 = "arith.constant"() <{value = 1 : index}> : () -> index
%2 = "arith.constant"() <{value = 1 : index}> : () -> index
"hal.return"(%0, %1, %2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "matmul_large_dispatch_0_matmul_2048x2048x512_i32", translation_info = #iree_codegen.translation_info<None>} : () -> ()
"builtin.module"() ({
"air.channel"() <{size = [1, 1], sym_name = "channel_7"}> : () -> ()
"air.channel"() <{size = [2, 2], sym_name = "channel_6"}> : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_5"}> : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_4"}> : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_3"}> {broadcast_shape = [2, 1]} : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_2"}> {broadcast_shape = [2, 1]} : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_1"}> {broadcast_shape = [1, 2]} : () -> ()
"air.channel"() <{size = [1, 1], sym_name = "channel_0"}> {broadcast_shape = [1, 2]} : () -> ()
"func.func"() <{function_type = () -> (), sym_name = "matmul_large_dispatch_0_matmul_2048x2048x512_i32"}> ({
%0 = "arith.constant"() <{value = 32 : index}> : () -> index
%1 = "arith.constant"() <{value = 0 : index}> : () -> index
%2:2 = "air.execute"() ({
%9 = "hal.interface.binding.subspan"(%1) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<2048x512xi32>
"air.execute_terminator"(%9) : (memref<2048x512xi32>) -> ()
}) : () -> (!air.async.token, memref<2048x512xi32>)
%3 = "air.execute"(%2#0) ({
"memref.assume_alignment"(%2#1) <{alignment = 64 : i32}> : (memref<2048x512xi32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%4:2 = "air.execute"() ({
%9 = "hal.interface.binding.subspan"(%1) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<512x2048xi32>
"air.execute_terminator"(%9) : (memref<512x2048xi32>) -> ()
}) : () -> (!air.async.token, memref<512x2048xi32>)
%5 = "air.execute"(%4#0) ({
"memref.assume_alignment"(%4#1) <{alignment = 64 : i32}> : (memref<512x2048xi32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%6:2 = "air.execute"() ({
%9 = "hal.interface.binding.subspan"(%1) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<2048x2048xi32>
"air.execute_terminator"(%9) : (memref<2048x2048xi32>) -> ()
}) : () -> (!air.async.token, memref<2048x2048xi32>)
%7 = "air.execute"(%6#0) ({
"memref.assume_alignment"(%6#1) <{alignment = 64 : i32}> : (memref<2048x2048xi32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%8 = "air.launch"(%7, %5, %3, %0, %0, %2#1, %4#1, %6#1) <{operandSegmentSizes = array<i32: 3, 2, 3>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: memref<2048x512xi32>, %arg5: memref<512x2048xi32>, %arg6: memref<2048x2048xi32>):
%9 = "arith.constant"() <{value = 2048 : index}> : () -> index
%10 = "arith.constant"() <{value = 64 : index}> : () -> index
%11 = "arith.constant"() <{value = 1 : index}> : () -> index
%12 = "arith.constant"() <{value = 512 : index}> : () -> index
%13 = "arith.constant"() <{value = 0 : index}> : () -> index
%14:2 = "air.execute"() ({
%22 = "affine.apply"(%arg0) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
"air.execute_terminator"(%22) : (index) -> ()
}) : () -> (!air.async.token, index)
%15 = "air.channel.put"(%14#0, %arg4, %14#1, %13, %10, %12, %12, %11) <{chan_name = @channel_4, operandSegmentSizes = array<i32: 1, 0, 1, 2, 2, 2>}> {id = 1 : i32} : (!air.async.token, memref<2048x512xi32>, index, index, index, index, index, index) -> !air.async.token
%16:2 = "air.execute"() ({
%22 = "affine.apply"(%arg1) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
"air.execute_terminator"(%22) : (index) -> ()
}) : () -> (!air.async.token, index)
%17 = "air.channel.put"(%16#0, %arg5, %13, %16#1, %12, %10, %9, %11) <{chan_name = @channel_5, operandSegmentSizes = array<i32: 1, 0, 1, 2, 2, 2>}> {id = 2 : i32} : (!air.async.token, memref<512x2048xi32>, index, index, index, index, index, index) -> !air.async.token
%18:2 = "air.execute"() ({
%22 = "affine.apply"(%arg0) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
"air.execute_terminator"(%22) : (index) -> ()
}) : () -> (!air.async.token, index)
%19:2 = "air.execute"() ({
%22 = "affine.apply"(%arg1) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
"air.execute_terminator"(%22) : (index) -> ()
}) : () -> (!air.async.token, index)
%20 = "air.channel.get"(%19#0, %18#0, %arg6, %18#1, %19#1, %10, %10, %9, %11) <{chan_name = @channel_7, operandSegmentSizes = array<i32: 2, 0, 1, 2, 2, 2>}> {id = 3 : i32} : (!air.async.token, !air.async.token, memref<2048x2048xi32>, index, index, index, index, index, index) -> !air.async.token
%21 = "air.segment"() <{operandSegmentSizes = array<i32: 0, 0, 0>, sym_name = "segment_0"}> ({
%22 = "arith.constant"() <{value = 32 : index}> : () -> index
%23 = "arith.constant"() <{value = 4 : index}> : () -> index
%24 = "arith.constant"() <{value = 32768 : index}> : () -> index
%25 = "arith.constant"() <{value = 8 : index}> : () -> index
%26 = "arith.constant"() <{value = 4096 : index}> : () -> index
%27 = "arith.constant"() <{value = 2048 : index}> : () -> index
%28 = "arith.constant"() <{value = 64 : index}> : () -> index
%29 = "arith.constant"() <{value = 1 : index}> : () -> index
%30 = "arith.constant"() <{value = 512 : index}> : () -> index
%31 = "arith.constant"() <{value = 0 : index}> : () -> index
%32 = "arith.constant"() <{value = 2 : index}> : () -> index
%33 = "air.wait_all"() : () -> !air.async.token
%34 = "air.wait_all"() : () -> !air.async.token
%35 = "air.channel.get"(%44#0, %33, %44#1) <{chan_name = @channel_4, operandSegmentSizes = array<i32: 2, 0, 1, 0, 0, 0>}> {id = 4 : i32} : (!air.async.token, !air.async.token, memref<1x1x64x512xi32, 1 : i32>) -> !air.async.token
%36 = "air.channel.get"(%45#0, %34, %45#1) <{chan_name = @channel_5, operandSegmentSizes = array<i32: 2, 0, 1, 0, 0, 0>}> {id = 5 : i32} : (!air.async.token, !air.async.token, memref<1x1x512x64xi32, 1 : i32>) -> !air.async.token
%37 = "scf.for"(%31, %30, %22, %36) ({
^bb0(%arg7: index, %arg8: !air.async.token):
%44:2 = "air.execute"() ({
%53 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x64x512xi32, 1 : i32>
"air.execute_terminator"(%53) : (memref<1x1x64x512xi32, 1 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x64x512xi32, 1 : i32>)
%45:2 = "air.execute"() ({
%53 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x512x64xi32, 1 : i32>
"air.execute_terminator"(%53) : (memref<1x1x512x64xi32, 1 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x512x64xi32, 1 : i32>)
%46 = "air.channel.put"(%arg8, %44#1, %31, %31, %31, %31, %31, %arg7, %29, %29, %23, %25, %23, %25, %24, %24, %25, %27, %30, %29) <{chan_name = @channel_0, operandSegmentSizes = array<i32: 1, 0, 1, 6, 6, 6>}> {id = 6 : i32} : (!air.async.token, memref<1x1x64x512xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%47 = "air.channel.put"(%arg8, %44#1, %31, %31, %31, %31, %22, %arg7, %29, %29, %23, %25, %23, %25, %24, %24, %25, %27, %30, %29) <{chan_name = @channel_1, operandSegmentSizes = array<i32: 1, 0, 1, 6, 6, 6>}> {id = 7 : i32} : (!air.async.token, memref<1x1x64x512xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%48 = "air.channel.put"(%arg8, %45#1, %31, %31, %31, %31, %arg7, %31, %29, %29, %25, %23, %25, %23, %24, %24, %23, %30, %28, %29) <{chan_name = @channel_2, operandSegmentSizes = array<i32: 1, 0, 1, 6, 6, 6>}> {id = 8 : i32} : (!air.async.token, memref<1x1x512x64xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%49 = "air.channel.put"(%arg8, %45#1, %31, %31, %31, %31, %arg7, %22, %29, %29, %25, %23, %25, %23, %24, %24, %23, %30, %28, %29) <{chan_name = @channel_3, operandSegmentSizes = array<i32: 1, 0, 1, 6, 6, 6>}> {id = 9 : i32} : (!air.async.token, memref<1x1x512x64xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%50 = "air.execute"() ({
"memref.dealloc"(%44#1) : (memref<1x1x64x512xi32, 1 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : () -> !air.async.token
%51 = "air.execute"() ({
"memref.dealloc"(%45#1) : (memref<1x1x512x64xi32, 1 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : () -> !air.async.token
%52 = "air.wait_all"(%46, %47, %48, %49, %50, %51) : (!air.async.token, !air.async.token, !air.async.token, !air.async.token, !air.async.token, !air.async.token) -> !air.async.token
"scf.yield"(%52) : (!air.async.token) -> ()
}) : (index, index, index, !air.async.token) -> !air.async.token
%38:2 = "air.execute"() ({
%44 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x64x64xi32, 1 : i32>
"air.execute_terminator"(%44) : (memref<1x1x64x64xi32, 1 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x64x64xi32, 1 : i32>)
%39 = "air.wait_all"() : () -> !air.async.token
%40 = "scf.parallel"(%31, %31, %32, %32, %29, %29, %38#0) <{operandSegmentSizes = array<i32: 2, 2, 2, 1>}> ({
^bb0(%arg7: index, %arg8: index):
%44:2 = "air.execute"() ({
%47 = "affine.apply"(%arg7) <{map = affine_map<()[s0] -> (s0 * 32)>}> : (index) -> index
"air.execute_terminator"(%47) : (index) -> ()
}) : () -> (!air.async.token, index)
%45:2 = "air.execute"() ({
%47 = "affine.apply"(%arg8) <{map = affine_map<()[s0] -> (s0 * 32)>}> : (index) -> index
"air.execute_terminator"(%47) : (index) -> ()
}) : () -> (!air.async.token, index)
%46 = "air.channel.get"(%38#0, %45#0, %44#0, %arg7, %arg8, %38#1, %31, %31, %44#1, %45#1, %29, %29, %22, %22, %26, %26, %28, %29) <{chan_name = @channel_6, operandSegmentSizes = array<i32: 3, 2, 1, 4, 4, 4>}> {id = 10 : i32} : (!air.async.token, !air.async.token, !air.async.token, index, index, memref<1x1x64x64xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
"scf.reduce"(%46) ({
^bb0(%arg9: !air.async.token, %arg10: !air.async.token):
%47 = "air.wait_all"(%arg9, %arg10) : (!air.async.token, !air.async.token) -> !air.async.token
"scf.reduce.return"(%47) : (!air.async.token) -> ()
}) : (!air.async.token) -> ()
}) : (index, index, index, index, index, index, !air.async.token) -> !air.async.token
%41 = "air.herd"(%38#0, %32, %32) <{operandSegmentSizes = array<i32: 1, 2, 0>, sym_name = "herd_0"}> ({
^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index):
%44 = "arith.constant"() <{value = 128 : index}> : () -> index
%45 = "arith.constant"() <{value = 16 : index}> : () -> index
%46 = "arith.constant"() <{value = 1024 : index}> : () -> index
%47 = "arith.constant"() <{value = 1 : index}> : () -> index
%48 = "arith.constant"() <{value = 8 : index}> : () -> index
%49 = "arith.constant"() <{value = 0 : i32}> : () -> i32
%50 = "arith.constant"() <{value = 0 : index}> : () -> index
%51 = "arith.constant"() <{value = 64 : index}> : () -> index
%52 = "arith.constant"() <{value = 4 : index}> : () -> index
%53:2 = "air.execute"() ({
%58 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x8x4x4xi32, 2 : i32>
"air.execute_terminator"(%58) : (memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x8x8x4x4xi32, 2 : i32>)
%54 = "air.execute"(%53#0) ({
"linalg.fill"(%49, %53#1) <{operandSegmentSizes = array<i32: 1, 1>}> ({
^bb0(%arg11: i32, %arg12: i32):
"linalg.yield"(%arg11) : (i32) -> ()
}) : (i32, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%55 = "scf.for"(%50, %51, %52, %54) ({
^bb0(%arg11: index, %arg12: !air.async.token):
%58:2 = "air.execute"() ({
%65 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x4x8x4x8xi32, 2 : i32>
"air.execute_terminator"(%65) : (memref<1x1x4x8x4x8xi32, 2 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x4x8x4x8xi32, 2 : i32>)
%59 = "affine.if"(%arg7, %arg8) ({
%65 = "air.channel.get"(%58#0, %arg12, %arg7, %arg8, %58#1) <{chan_name = @channel_0, operandSegmentSizes = array<i32: 2, 2, 1, 0, 0, 0>}> {id = 11 : i32} : (!air.async.token, !air.async.token, index, index, memref<1x1x4x8x4x8xi32, 2 : i32>) -> !air.async.token
"affine.yield"(%65) : (!air.async.token) -> ()
}, {
%65 = "air.channel.get"(%58#0, %arg12, %arg7, %arg8, %58#1) <{chan_name = @channel_1, operandSegmentSizes = array<i32: 2, 2, 1, 0, 0, 0>}> {id = 12 : i32} : (!air.async.token, !air.async.token, index, index, memref<1x1x4x8x4x8xi32, 2 : i32>) -> !air.async.token
"affine.yield"(%65) : (!air.async.token) -> ()
}) {condition = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>} : (index, index) -> !air.async.token
%60:2 = "air.execute"() ({
%65 = "memref.alloc"() <{operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x4x8x4xi32, 2 : i32>
"air.execute_terminator"(%65) : (memref<1x1x8x4x8x4xi32, 2 : i32>) -> ()
}) : () -> (!air.async.token, memref<1x1x8x4x8x4xi32, 2 : i32>)
%61 = "affine.if"(%arg7, %arg8) ({
%65 = "air.channel.get"(%60#0, %arg12, %arg7, %arg8, %60#1) <{chan_name = @channel_2, operandSegmentSizes = array<i32: 2, 2, 1, 0, 0, 0>}> {id = 13 : i32} : (!air.async.token, !air.async.token, index, index, memref<1x1x8x4x8x4xi32, 2 : i32>) -> !air.async.token
"affine.yield"(%65) : (!air.async.token) -> ()
}, {
%65 = "air.channel.get"(%60#0, %arg12, %arg7, %arg8, %60#1) <{chan_name = @channel_3, operandSegmentSizes = array<i32: 2, 2, 1, 0, 0, 0>}> {id = 14 : i32} : (!air.async.token, !air.async.token, index, index, memref<1x1x8x4x8x4xi32, 2 : i32>) -> !air.async.token
"affine.yield"(%65) : (!air.async.token) -> ()
}) {condition = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>} : (index, index) -> !air.async.token
%62 = "air.execute"(%61, %59) ({
"linalg.generic"(%58#1, %60#1, %53#1) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>], operandSegmentSizes = array<i32: 2, 1>}> ({
^bb0(%arg13: i32, %arg14: i32, %arg15: i32):
%65 = "arith.muli"(%arg13, %arg14) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
%66 = "arith.addi"(%arg15, %65) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
"linalg.yield"(%66) : (i32) -> ()
}) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token, !air.async.token) -> !air.async.token
%63 = "air.execute"(%62) ({
"memref.dealloc"(%58#1) : (memref<1x1x4x8x4x8xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
%64 = "air.execute"(%62) ({
"memref.dealloc"(%60#1) : (memref<1x1x8x4x8x4xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
"scf.yield"(%62) : (!air.async.token) -> ()
}) : (index, index, index, !air.async.token) -> !air.async.token
%56 = "air.channel.put"(%55, %arg7, %arg8, %53#1, %50, %50, %50, %50, %50, %50, %47, %47, %48, %52, %48, %52, %46, %46, %45, %52, %44, %47) <{chan_name = @channel_6, operandSegmentSizes = array<i32: 1, 2, 1, 6, 6, 6>}> {id = 15 : i32} : (!air.async.token, index, index, memref<1x1x8x8x4x4xi32, 2 : i32>, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%57 = "air.execute"(%56) ({
"memref.dealloc"(%53#1) : (memref<1x1x8x8x4x4xi32, 2 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
"air.herd_terminator"() : () -> ()
}) {id = 3 : i32} : (!air.async.token, index, index) -> !air.async.token
%42 = "air.channel.put"(%41, %34, %33, %38#1, %31, %31, %31, %31, %29, %29, %28, %28, %26, %26, %28, %29) <{chan_name = @channel_7, operandSegmentSizes = array<i32: 3, 0, 1, 4, 4, 4>}> {id = 16 : i32} : (!air.async.token, !air.async.token, !air.async.token, memref<1x1x64x64xi32, 1 : i32>, index, index, index, index, index, index, index, index, index, index, index, index) -> !air.async.token
%43 = "air.execute"(%42) ({
"memref.dealloc"(%38#1) : (memref<1x1x64x64xi32, 1 : i32>) -> ()
"air.execute_terminator"() : () -> ()
}) : (!air.async.token) -> !air.async.token
"air.segment_terminator"() : () -> ()
}) {id = 2 : i32} : () -> !air.async.token
"air.launch_terminator"() : () -> ()
}) {id = 1 : i32} : (!air.async.token, !air.async.token, !air.async.token, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32>) -> !air.async.token
"func.return"() : () -> ()
}) : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "amdaie_xclbin_fb", sym_visibility = "public", target = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>} : () -> ()
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIELoweringStrategy (iree-amdaie-lowering-strategy) //----- //
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIELowerExecutableTarget (iree-amdaie-lower-executable-target) //----- //
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%3, %4 : tensor<2048x512xi32>, tensor<512x2048xi32>) outs(%6 : tensor<2048x2048xi32>) -> tensor<2048x2048xi32>
%8 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%9 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%10 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%9 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %10 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} ins(%extracted_slice, %extracted_slice_0 : tensor<64x512xi32>, tensor<512x64xi32>) outs(%7 : tensor<64x64xi32>) -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %8 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%9 = tensor.empty() : tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x512x64xi32>
%pack_2 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %10 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%11 = tensor.empty() : tensor<1x1x64x64xi32>
%pack_3 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %11 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_2 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_3 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_4: i32, %out: i32):
%13 = arith.muli %in, %in_4 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x64x64xi32>
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %9 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x64x512xi32>
%11 = tensor.empty() : tensor<1x1x512x64xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %12 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%13 = tensor.empty() : tensor<1x1x64x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%pack_5 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_3 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_5 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%16 = arith.muli %in, %in_6 : i32
%17 = arith.addi %out, %16 : i32
linalg.yield %17 : i32
} -> tensor<1x1x64x64xi32>
%unpack = tensor.unpack %15 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%8 = tensor.empty() : tensor<1x1x64x512xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %9 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%10 = tensor.empty() : tensor<1x1x64x512xi32>
%11 = tensor.empty() : tensor<1x1x512x64xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%12 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %12 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%13 = tensor.empty() : tensor<1x1x64x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%14 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%pack_5 = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %14 : tensor<64x64xi32> -> tensor<1x1x64x64xi32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack, %pack_3 : tensor<1x1x64x512xi32>, tensor<1x1x512x64xi32>) outs(%pack_5 : tensor<1x1x64x64xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_6: i32, %out: i32):
%17 = arith.muli %in, %in_6 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x64x64xi32>
%16 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %pack_5) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_6 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_7 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_8 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_6, %extracted_slice_7 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_8 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%18 = arith.muli %in, %in_9 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %17 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %16 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%7 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_1 : tensor<64x64xi32>) -> tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %8 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %9 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%10 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%12 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %11) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%14 = arith.muli %in, %in_8 : i32
%15 = arith.addi %out, %14 : i32
linalg.yield %15 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %12 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %7 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %10) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEFuseFillIntoForall (iree-amdaie-fuse-fill-into-forall) //----- //
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %10) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
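// Annotation: compared with the previous dump, the accumulator is now
// zero-filled per 32x32 tile inside the inner scf.forall (%12) instead of
// threading the whole 64x64 fill through shared_outs. This leaves the outer
// fill (%10) and the duplicated extract_slice (%extracted_slice_8) dead;
// the cleanup pass below is expected to delete them.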
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = linalg.fill ins(%c0_i32 : i32) outs(%9 : tensor<1x1x64x64xi32>) -> tensor<1x1x64x64xi32>
%11 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%extracted_slice_8 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%12 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_9: i32, %out: i32):
%14 = arith.muli %in, %in_9 : i32
%15 = arith.addi %out, %14 : i32
linalg.yield %15 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %11 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
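// Annotation: output of the cleanup pass. The dead whole-tile fill and the
// duplicate extract_slice are gone; each 32x32 accumulator slice is now
// zero-filled (%11) immediately before the matmul-like linalg.generic
// consumes it.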
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
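// Annotation: the canonicalizer found nothing to fold at this point; the
// dump below repeats the previous one verbatim.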
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEPackAndTranspose (iree-amdaie-pack-and-transpose) //----- //
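// Annotation: CSE likewise made no changes; the function body below is
// identical to the two preceding dumps (this printout simply omits the
// surrounding module wrapper).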
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x1x32x512xi32>, tensor<1x1x512x32xi32>) outs(%11 : tensor<1x1x32x32xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_8: i32, %out: i32):
%13 = arith.muli %in, %in_8 : i32
%14 = arith.addi %out, %13 : i32
linalg.yield %14 : i32
} -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
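// Annotation: output of the pack-and-transpose step. The second level of the
// packing_config (packedSizes = [0, 0, 0, 4, 4, 8]) has been applied: the
// 1x1x32x512 and 1x1x512x32 operand tiles are repacked into 6-D layouts
// (1x1x64x8x4x8 and 1x1x8x64x8x4, using outer_dims_perm = [0, 1, 3, 2]), the
// 1x1x32x32 accumulator into 1x1x8x8x4x4, and the matmul becomes a 9-D
// linalg.generic. Reading its indexing maps: d0/d1 are unit outer M/N dims,
// d2 a unit outer reduction, d3/d4 index the 8x8 grid of 4x4 output
// sub-tiles, d5 walks the 64 K-tiles, and d6/d7/d8 are the innermost
// 4 (M) x 4 (N) x 8 (K) element dims, so 64 * 8 recovers the full K = 512.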
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%pack_10 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_10 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%19 = arith.muli %in, %in_12 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
%unpack_11 = tensor.unpack %18 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIETileAndFuse (iree-amdaie-tile-and-fuse) //----- //
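// Annotation: output of bufferize-to-allocation. The packed 1x1x8x8x4x4
// accumulator tile now has its own memory-space-2 allocation (%alloc_10),
// accessed through bufferization.to_tensor, mirroring the space-1
// allocations that already back the 64x64-level operands. Presumably the
// two address spaces correspond to shared (memtile) and core-local memory
// on the AIE target, though the dump itself only shows the space indices.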
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%pack_11 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_11 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%20 = arith.muli %in, %in_13 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
%unpack_12 = tensor.unpack %19 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_12 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
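// Annotation: output of tile-and-fuse. The third level of tile_sizes
// ([0, 0, 0, 0, 0, 4]) tiles the K-tile dimension d5 by 4, producing the
// scf.for below over 0..64 step 4: 16 iterations, each consuming a
// 1x1x4x8x4x8 slice of A and a 1x1x8x4x8x4 slice of B, i.e. 4 * 8 = 32 of
// the 512 reduction elements per iteration. The re-materialized constants,
// leftover tensor.empty ops, and the no-op extract/insert_slice pair around
// %arg7 are expected to be swept away by the cleanup pass below.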
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x8x64x4x8xi32>
%13 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %13 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%14 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%15 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %15 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%16 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%17 = tensor.empty() : tensor<1x1x8x8x4x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%pack_11 = tensor.pack %11 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x8x4x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_8, %pack_9 : tensor<1x1x64x8x4x8xi32>, tensor<1x1x8x64x8x4xi32>) outs(%pack_11 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%21 = arith.muli %in, %in_14 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<1x1x8x8x4x4xi32>
%c0_12 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%20 = scf.for %arg6 = %c0_12 to %c64 step %c4 iter_args(%arg7 = %pack_11) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_14 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_15 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%extracted_slice_16 = tensor.extract_slice %arg7[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> to tensor<1x1x8x8x4x4xi32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_14, %extracted_slice_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%extracted_slice_16 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_17: i32, %out: i32):
%22 = arith.muli %in, %in_17 : i32
%23 = arith.addi %out, %22 : i32
linalg.yield %23 : i32
} -> tensor<1x1x8x8x4x4xi32>
%inserted_slice = tensor.insert_slice %21 into %arg7[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x8x4x4xi32> into tensor<1x1x8x8x4x4xi32>
scf.yield %inserted_slice : tensor<1x1x8x8x4x4xi32>
}
%unpack_13 = tensor.unpack %20 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_13 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
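// Annotation: output of the cleanup pass. The dead tensor.empty ops and
// no-op slices are gone, and the zero-fill now initializes the packed
// accumulator (%15) directly, which the scf.for carries through iter_args.
// The unpack of the loop result still targets the 32x32 fill %11, which is
// now redundant since the unpack overwrites the whole destination.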
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_7 : tensor<1x1x32x32xi32>) -> tensor<1x1x32x32xi32>
%12 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %12 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%13 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %13 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%16 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %15) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%18 = arith.muli %in, %in_14 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %11 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
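// Annotation: the canonicalizer dropped that redundant 32x32 fill; the
// unpack of the loop result now writes straight into %extracted_slice_7.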
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%17 = arith.muli %in, %in_14 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %16 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEFusePackIntoForLoop (iree-amdaie-fuse-pack-into-for) //----- //
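// Annotation: CSE made no further changes; this dump repeats the previous
// one at function scope. Judging by its name, the next pass likely fuses
// the operand tensor.pack ops into the reduction scf.for so that each
// iteration packs only the 4-K-tile slice it consumes, though that is an
// inference from the pass name rather than something shown in this dump.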
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%extracted_slice_12 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%extracted_slice_12, %extracted_slice_13 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%17 = arith.muli %in, %in_14 : i32
%18 = arith.addi %out, %17 : i32
linalg.yield %18 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %16 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIECleanup (iree-amdaie-cleanup) //----- //
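// Fusion has sunk the packs into the scf.for body: %pack_14 and %pack_18 now pack
// one 1x1x32x32 tile per iteration directly into slices of %11 and %12, and the
// linalg.generic consumes those. The hoisted %pack_8/%pack_9 and the slices taken
// from them (%extracted_slice_15, %extracted_slice_19) are now dead; the cleanup
// pass removes them, as the next dump shows.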
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%pack_8 = tensor.pack %extracted_slice_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %11 : tensor<1x1x32x512xi32> -> tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%pack_9 = tensor.pack %extracted_slice_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %12 : tensor<1x1x512x32xi32> -> tensor<1x1x8x64x8x4xi32>
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_10 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_12 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_13 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_14 = tensor.pack %extracted_slice_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_13 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_15 = tensor.extract_slice %pack_8[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_16 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_17 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_18 = tensor.pack %extracted_slice_16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_17 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%extracted_slice_19 = tensor.extract_slice %pack_9[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_14, %pack_18 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_20: i32, %out: i32):
%19 = arith.muli %in, %in_20 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_11 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_11 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
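// After cleanup the hoisted packs and their dead slices are gone. One redundancy
// remains in the loop body: two identical affine.apply affine_map<(d0) -> (d0 * 8)>
// ops (%16 and %17). They survive this canonicalization run and are merged by the
// CSE pass that follows.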
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%19 = arith.muli %in, %in_16 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
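// Canonicalization left this function essentially unchanged; CSE now folds the
// duplicated affine.apply ops (%16 and %17 compute the same d0 * 8 index) into a
// single op reused by both operand slices.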
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%17 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %17, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%19 = arith.muli %in, %in_16 : i32
%20 = arith.addi %out, %19 : i32
linalg.yield %20 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %18 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before AMDAIEBufferizeToAllocation (iree-amdaie-bufferize-to-allocation) //----- //
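// With a single affine.apply now feeding both 32x32 operand slices, this pass gives
// the per-iteration pack destinations explicit allocations: in the next dump the
// 1x1x4x8x4x8 and 1x1x8x4x8x4 pack results target memref.alloc ops in memory
// space 2 (presumably the core-local level, matching the existing space-2
// accumulator %alloc_8) instead of slices of tensor.empty.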
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %extracted_slice_11 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_14 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %extracted_slice_14 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before AMDAIELowerToUKernels (iree-amdaie-lower-to-ukernels) //----- //
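// The operand packs now write into dedicated space-2 buffers (%alloc_12, %alloc_16)
// that are deallocated at the end of every iteration. The lower-to-ukernels pass
// would swap matching linalg ops for microkernel calls, but this variant was
// compiled with ukernels = "none", so the linalg.generic is still present in the
// following dump.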
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%11 = tensor.empty() : tensor<1x1x64x8x4x8xi32>
%12 = tensor.empty() : tensor<1x1x8x64x8x4xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%13 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%14 = linalg.fill ins(%c0_i32 : i32) outs(%13 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%15 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %14) -> (tensor<1x1x8x8x4x4xi32>) {
%16 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %16] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%extracted_slice_11 = tensor.extract_slice %11[0, 0, %arg6, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1, 1, 1, 1, 1, 1] : tensor<1x1x64x8x4x8xi32> to tensor<1x1x4x8x4x8xi32>
%alloc_12 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%17 = bufferization.to_tensor %alloc_12 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_13 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %17 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_14 = tensor.extract_slice %extracted_slice_6[0, 0, %16, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%extracted_slice_15 = tensor.extract_slice %12[0, 0, 0, %arg6, 0, 0] [1, 1, 8, 4, 8, 4] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x64x8x4xi32> to tensor<1x1x8x4x8x4xi32>
%alloc_16 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%18 = bufferization.to_tensor %alloc_16 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_17 = tensor.pack %extracted_slice_14 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %18 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_13, %pack_17 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_18: i32, %out: i32):
%20 = arith.muli %in, %in_18 : i32
%21 = arith.addi %out, %20 : i32
linalg.yield %21 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_16 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %19 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %15 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
// -----// IR Dump Before EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
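// The leftover tensor.empty ops for the packed layouts (%11, %12) and their unused
// slices have been cleaned up; every pack now lands in a bufferization.to_tensor of
// a real allocation. EliminateEmptyTensors targets the one remaining tensor.empty,
// the 2048x2048 result init (%5): in the next dump it is replaced by a
// flow.dispatch.tensor.load of the writeonly output binding, letting bufferization
// write results in place.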
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = tensor.empty() : tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
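// %5 now aliases the output binding instead of a freshly created empty tensor.
// EmptyTensorToAllocTensor would rewrite any remaining tensor.empty into
// bufferization.alloc_tensor, but none are left in this function, so the IR below
// appears to carry over unchanged into comprehensive bufferization.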
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
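// Final tensor-level form. Comprehensive bufferization is expected to rewrite the
// remaining tensor ops (pack/unpack, extract_slice, fill, and the generic) onto the
// memrefs introduced by the earlier bufferize-to-allocation steps, removing the
// bufferization.to_tensor bridges between the allocations and the tensor SSA values.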
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x512xi32>> -> tensor<2048x512xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi32>> -> tensor<512x2048xi32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>> -> tensor<2048x2048xi32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) shared_outs(%arg2 = %5) -> (tensor<2048x2048xi32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [64, 512] [1, 1] : tensor<2048x512xi32> to tensor<64x512xi32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [512, 64] [1, 1] : tensor<512x2048xi32> to tensor<512x64xi32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<2048x2048xi32> to tensor<64x64xi32>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%7 = bufferization.to_tensor %alloc restrict writable : memref<1x1x64x512xi32, 1 : i32>
%pack = tensor.pack %extracted_slice inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %7 : tensor<64x512xi32> -> tensor<1x1x64x512xi32>
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%8 = bufferization.to_tensor %alloc_2 restrict writable : memref<1x1x512x64xi32, 1 : i32>
%pack_3 = tensor.pack %extracted_slice_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %8 : tensor<512x64xi32> -> tensor<1x1x512x64xi32>
%alloc_4 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%9 = bufferization.to_tensor %alloc_4 restrict writable : memref<1x1x64x64xi32, 1 : i32>
%10 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 64) step (32, 32) shared_outs(%arg5 = %9) -> (tensor<1x1x64x64xi32>) {
%extracted_slice_5 = tensor.extract_slice %pack[0, 0, %arg3, 0] [1, 1, 32, 512] [1, 1, 1, 1] : tensor<1x1x64x512xi32> to tensor<1x1x32x512xi32>
%extracted_slice_6 = tensor.extract_slice %pack_3[0, 0, 0, %arg4] [1, 1, 512, 32] [1, 1, 1, 1] : tensor<1x1x512x64xi32> to tensor<1x1x512x32xi32>
%extracted_slice_7 = tensor.extract_slice %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x64x64xi32> to tensor<1x1x32x32xi32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
%11 = bufferization.to_tensor %alloc_8 restrict writable : memref<1x1x8x8x4x4xi32, 2 : i32>
%12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x1x8x8x4x4xi32>) -> tensor<1x1x8x8x4x4xi32>
%13 = scf.for %arg6 = %c0 to %c64 step %c4 iter_args(%arg7 = %12) -> (tensor<1x1x8x8x4x4xi32>) {
%14 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg6)
%extracted_slice_10 = tensor.extract_slice %extracted_slice_5[0, 0, 0, %14] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x512xi32> to tensor<1x1x32x32xi32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%15 = bufferization.to_tensor %alloc_11 restrict writable : memref<1x1x4x8x4x8xi32, 2 : i32>
%pack_12 = tensor.pack %extracted_slice_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %15 : tensor<1x1x32x32xi32> -> tensor<1x1x4x8x4x8xi32>
%extracted_slice_13 = tensor.extract_slice %extracted_slice_6[0, 0, %14, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x512x32xi32> to tensor<1x1x32x32xi32>
%alloc_14 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%16 = bufferization.to_tensor %alloc_14 restrict writable : memref<1x1x8x4x8x4xi32, 2 : i32>
%pack_15 = tensor.pack %extracted_slice_13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %16 : tensor<1x1x32x32xi32> -> tensor<1x1x8x4x8x4xi32>
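        // Annotation: the 9-D linalg.generic below is the packed matmul micro-kernel.
        // d2, d5, d8 are the reduction dims and d0, d1, d3, d4, d6, d7 the parallel dims,
        // so over the 1x1x4x8x4x8 and 1x1x8x4x8x4 packed operands it performs a
        // 32x32x32 matmul-accumulate; the region is the scalar body out += in0 * in1.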
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%pack_12, %pack_15 : tensor<1x1x4x8x4x8xi32>, tensor<1x1x8x4x8x4xi32>) outs(%arg7 : tensor<1x1x8x8x4x4xi32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_16: i32, %out: i32):
%18 = arith.muli %in, %in_16 : i32
%19 = arith.addi %out, %18 : i32
linalg.yield %19 : i32
} -> tensor<1x1x8x8x4x4xi32>
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_14 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %17 : tensor<1x1x8x8x4x4xi32>
}
%unpack_9 = tensor.unpack %13 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %extracted_slice_7 : tensor<1x1x8x8x4x4xi32> -> tensor<1x1x32x32xi32>
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack_9 into %arg5[0, 0, %arg3, %arg4] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<1x1x32x32xi32> into tensor<1x1x64x64xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %extracted_slice_1 : tensor<1x1x64x64xi32> -> tensor<64x64xi32>
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x64x64xi32, 1 : i32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %unpack into %arg2[%arg0, %arg1] [64, 64] [1, 1] : tensor<64x64xi32> into tensor<2048x2048xi32>
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xi32>>
return
}
}
// -----// IR Dump Before ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
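// Annotation: bufferization is done. The dispatch bindings are now raw memrefs with
// memref.assume_alignment, tensor.extract_slice has become memref.subview, and
// tensor.pack/unpack became iree_linalg_ext.pack/unpack on buffers. The copy
// linalg.generics near the end of the function are conservative copies that
// bufferization inserted where it could not prove the destination was already up to
// date; later cleanup passes remove them. With every shape static,
// ResolveShapedTypeResultDims itself has nothing to resolve here.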
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
%3 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %alloc_8) -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %4, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%5 = arith.muli %in, %in_14 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %3 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
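// Annotation: same function as the previous dump, printed without the module wrapper.
// Comparing with the dump after this canonicalization run: the dead iter_args are
// dropped from the scf.for (the accumulator memref is only threaded through
// unchanged), the unpack then consumes %alloc_8 directly, and the whole-buffer
// identity copy of %2 at the end of the function is erased.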
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
%3 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %alloc_8) -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %4, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%5 = arith.muli %in, %in_14 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
scf.yield %arg5 : memref<1x1x8x8x4x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %3 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) outs(%2 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
return
}
// -----// IR Dump Before CSE (cse) //----- //
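// Annotation: CSE now merges the duplicated memref.subview ops (%subview_7 vs.
// %subview_9 over %alloc_3, and %subview_1 vs. %subview_4 over %2), which turns the
// two remaining copy linalg.generics into self-copies in the next dump.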
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_5 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_7 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_8 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_10 = memref.subview %subview_5[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_12 = memref.subview %subview_6[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_13 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_12 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_13 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_13 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_14: i32, %out: i32):
%4 = arith.muli %in, %in_14 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_13 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_7 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_8 : memref<1x1x8x8x4x4xi32, 2 : i32>
%subview_9 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_7 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_9 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
%subview_4 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
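// Annotation: after CSE the leftover copies read and write the same subview (ins and
// outs alias), so this canonicalization round deletes them, leaving a pure
// pack -> micro-kernel -> unpack pipeline on buffers.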
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%subview_6 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) outs(%subview_6 : memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
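// Annotation: the self-copies are gone. CleanupBufferAllocView folds or removes
// redundant allocation/view pairs; the IR here is already clean, and the dump that
// follows it is unchanged, so the pass is a no-op at this point.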
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
// -----// IR Dump Before LowerUKernelOpsToCalls (iree-codegen-lower-ukernel-ops-to-calls) //----- //
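// Annotation: the variant was configured with ukernels = "none" and no
// iree_codegen.ukernel ops exist in the IR, so lowering ukernel ops to calls is a
// no-op here; the next dump is identical.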
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
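// Annotation: FoldMemRefAliasOps composes chains of memref.subview into their users.
// Its effect shows in the next dump: inside the K loop the packs now take subviews of
// %alloc and %alloc_2 directly, with the loop offset folded into the subview indices
// instead of going through the intermediate %subview_4/%subview_5 views.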
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc[0, 0, %arg2, 0] [1, 1, 32, 512] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%subview_5 = memref.subview %alloc_2[0, 0, 0, %arg3] [1, 1, 512, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%subview_6 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %subview_4[0, 0, 0, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x32x512xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_10 = memref.subview %subview_5[0, 0, %3, 0] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_11 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_10 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_11 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_12: i32, %out: i32):
%4 = arith.muli %in, %in_12 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_11 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_7 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_6 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_7 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
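// Annotation: back at executable-variant scope; the export now carries
// translation_info = #iree_codegen.translation_info<None>. Alias folding left two
// identical affine.apply ops in the K loop (%3 and %4 both compute %arg4 * 8),
// duplicates that a later CSE run can fold.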
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %alloc_2[0, 0, %4, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
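// NOTE: the linalg.generic below is the packed matmul kernel. With
// iterator_types [P,P,R, P,P,R, P,P,R], (d0,d1,d2) are the outer M/N/K tile
// loops (all of extent 1 here), (d3,d4) walk the 8x8 grid of 4x4 output
// micro-tiles, d5 walks the 4 K-tiles, and (d6,d7,d8) form the innermost
// C(4x4) += A(4x8) * B(8x4) contraction. A rough sketch of the schedule for
// one 64x64 C tile, read off the surrounding loops (not part of the IR):
//   for (core_m, core_n) in 2x2:         // inner scf.forall, 32x32 sub-tile
//     for k in range(0, 64, 4):          // 16 slabs; offset = k * 8, K = 32
//       for (m8, n8, k4) in 8 x 8 x 4:   // packed tile grid in local memory
//         C[m8, n8] += A[m8, k4] @ B[k4, n8]   // 4x4 += 4x8 @ 8x4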
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%5 = arith.muli %in, %in_10 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before CSE (cse) //----- //
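// NOTE: CSE removes the duplicated `%4 = affine.apply ...` inside the K-loop;
// in the next dump, `%subview_8` indexes `%alloc_2` with `%3` directly.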
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_8 = memref.subview %alloc_2[0, 0, %4, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%5 = arith.muli %in, %in_10 : i32
%6 = arith.addi %out, %5 : i32
linalg.yield %6 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before AMDAIELowerWorkgroupCount (iree-amdaie-lower-workgroup-count) //----- //
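// NOTE: judging by its name, this pass materializes the workgroup count for
// the export (`flow.dispatch.workgroup_count_from_slice`). Its effect is not
// directly visible below, because the subsequent dumps are printed at
// inner-module scope and the hal.executable wrapper no longer appears.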
hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
hal.executable.export public @matmul_large_dispatch_0_matmul_2048x2048x512_i32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<None>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
}
// -----// IR Dump Before EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
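// NOTE: this pass erases the `#hal.descriptor_type<storage_buffer>` memory
// space from the binding memref types: compare
// `memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>` here with the
// plain `memref<2048x512xi32>` in the next dump.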
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 64 : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 64 : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 64 : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32, #hal.descriptor_type<storage_buffer>> to memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32, #hal.descriptor_type<storage_buffer>> to memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
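// NOTE: fold-memref-alias-ops folds subview/expand/collapse view chains into
// their users where it can; here every subview feeds a pack/unpack or another
// subview, and the next dump appears unchanged.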
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before AMDAIEBridgeToAIR (iree-amdaie-bridge-to-air) //----- //
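// NOTE: the bridge to AIR rewrites both `scf.forall` loops as `scf.parallel`
// with explicit bounds, steps, and `scf.reduce` terminators (next dump), and
// the K-loop `affine.apply` switches to symbol form
// `affine_map<()[s0] -> (s0 * 8)>`; the pack/unpack ops are untouched at this
// point.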
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.forall (%arg0, %arg1) = (0, 0) to (2048, 2048) step (64, 64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.forall (%arg2, %arg3) = (0, 0) to (64, 64) step (32, 32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}
// -----// IR Dump Before AMDAIEDecomposeLinalgExtPackUnPackToAIR (iree-amdaie-decompose-pack-unpack-to-air) //----- //
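// NOTE: this pass decomposes `iree_linalg_ext.pack`/`unpack` into view ops
// plus DMA copies: in the next dump, each pack into a `2 : i32` buffer
// becomes `memref.expand_shape` + `memref.transpose` (relayout views, no data
// movement by themselves) feeding an `air.dma_memcpy_nd`, while the packs and
// unpacks against the `1 : i32` buffers become plain `air.dma_memcpy_nd` of
// the corresponding subviews.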
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
iree_linalg_ext.pack %subview inner_dims_pos = [0, 1] inner_tiles = [64, 512] into %alloc : (memref<64x512xi32, strided<[512, 1], offset: ?>> memref<1x1x64x512xi32, 1 : i32>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
iree_linalg_ext.pack %subview_0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [512, 64] into %alloc_2 : (memref<512x64xi32, strided<[2048, 1], offset: ?>> memref<1x1x512x64xi32, 1 : i32>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
%subview_4 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_5 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg4]
%subview_6 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_7 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
iree_linalg_ext.pack %subview_6 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %alloc_7 : (memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> memref<1x1x4x8x4x8xi32, 2 : i32>)
%subview_8 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
iree_linalg_ext.pack %subview_8 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [8, 4] into %alloc_9 : (memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> memref<1x1x8x4x8x4xi32, 2 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_7, %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_10: i32, %out: i32):
%4 = arith.muli %in, %in_10 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_7 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_9 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
iree_linalg_ext.unpack %alloc_5 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 4] into %subview_4 : (memref<1x1x8x8x4x4xi32, 2 : i32> memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>)
memref.dealloc %alloc_5 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.reduce
}
iree_linalg_ext.unpack %alloc_3 inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %subview_1 : (memref<1x1x64x64xi32, 1 : i32> memref<64x64xi32, strided<[2048, 1], offset: ?>>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before ParallelToHerd (air-par-to-herd) //----- //
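// NOTE: air-par-to-herd maps the inner 2x2 `scf.parallel` onto
// `air.herd @herd_0 tile (..) in (2, 2)` (next dump); the former induction
// variables reappear as herd coordinates scaled by 32 via
// `affine_map<(d0) -> (d0 * 32)>`. Presumably each herd tile is one AIE
// compute core, with the `2 : i32` buffers living in core-local memory and
// the `1 : i32` buffers one level up.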
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_2[] [] [], %subview_0[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c64, %c64) step (%c32, %c32) {
%subview_5 = memref.subview %alloc_3[0, 0, %arg2, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_6 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg4 = %c0 to %c64 step %c4 {
%3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg4]
%subview_8 = memref.subview %alloc[0, 0, %arg2, %3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_9 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
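// NOTE: the expand_shape/transpose pair below is the decomposed form of the
// earlier `iree_linalg_ext.pack ... inner_tiles = [4, 8]`: the 32x32 slab is
// reshaped into an 8x4 grid of 4x8 tiles, the tile grid is transposed to
// match `outer_dims_perm = [0, 1, 3, 2]`, and the DMA then gathers the strided
// view into the contiguous 1x1x4x8x4x8 local buffer.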
%expand_shape = memref.expand_shape %subview_8 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_10 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_9[] [] [], %transpose_10[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_11 = memref.subview %alloc_2[0, 0, %3, %arg3] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_13 = memref.expand_shape %subview_11 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_14 = memref.transpose %expand_shape_13 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %transpose_14[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_9, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_15: i32, %out: i32):
%4 = arith.muli %in, %in_15 : i32
%5 = arith.addi %out, %4 : i32
linalg.yield %5 : i32
}
memref.dealloc %alloc_9 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_7 = memref.transpose %alloc_6 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_5[] [] [], %transpose_7[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_6 : memref<1x1x8x8x4x4xi32, 2 : i32>
scf.reduce
}
%subview_4 = memref.subview %alloc_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_4 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_1[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before ParallelToLaunch (air-par-to-launch) //----- //
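// NOTE: air-par-to-launch maps the outer 32x32 `scf.parallel` onto
// `air.launch (..) in (32, 32)` wrapping an `air.segment` (next dump); the
// per-workgroup base offsets come back as `affine_map<(d0) -> (d0 * 64)>`
// applied to the launch induction variables (2048 / 64 = 32 tiles per
// dimension).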
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c2048, %c2048) step (%c64, %c64) {
%subview = memref.subview %0[%arg0, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %1[0, %arg1] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_2 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_2[] [] [], %subview_0[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_3 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%c2 = arith.constant 2 : index
%c0_4 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2_5 = arith.constant 2 : index
%c0_6 = arith.constant 0 : index
%c1_7 = arith.constant 1 : index
%c2_8 = arith.constant 2 : index
%c2_9 = arith.constant 2 : index
air.herd @herd_0 tile (%arg2, %arg3) in (%arg4=%c2_8, %arg5=%c2_9) args(%arg6=%alloc_3, %arg7=%alloc, %arg8=%alloc_2) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32_11 = arith.constant 0 : i32
%c0_12 = arith.constant 0 : index
%c64_13 = arith.constant 64 : index
%c4_14 = arith.constant 4 : index
%3 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg2)
%4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg3)
%subview_15 = memref.subview %arg6[0, 0, %3, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_16 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32_11 : i32) outs(%alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg9 = %c0_12 to %c64_13 step %c4_14 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
%subview_18 = memref.subview %arg7[0, 0, %3, %5] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_19 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_18 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_20 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_19[] [] [], %transpose_20[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_21 = memref.subview %arg8[0, 0, %5, %4] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_22 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_23 = memref.expand_shape %subview_21 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_24 = memref.transpose %expand_shape_23 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_22[] [] [], %transpose_24[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_19, %alloc_22 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_25: i32, %out: i32):
%6 = arith.muli %in, %in_25 : i32
%7 = arith.addi %out, %6 : i32
linalg.yield %7 : i32
}
memref.dealloc %alloc_19 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_22 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_17 = memref.transpose %alloc_16 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_15[] [] [], %transpose_17[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_16 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_10 = memref.subview %alloc_3[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_10 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_1[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_3 : memref<1x1x64x64xi32, 1 : i32>
scf.reduce
}
return
}
}
// -----// IR Dump Before CopyToDma (air-copy-to-dma) //----- //
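// NOTE: at this point the AIR hierarchy is complete: air.launch (32x32
// workgroups) > air.segment @segment_0 > air.herd @herd_0 (2x2 cores).
// air-copy-to-dma would convert any remaining memref copies to DMAs, but the
// transfers here are already `air.dma_memcpy_nd`. The unused index constants
// around the launch and herd (e.g. %c0_1, %c1, %c0_11) look like leftovers
// from the loop conversions, to be swept up by later canonicalization.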
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c2048 = arith.constant 2048 : index
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
%c32_0 = arith.constant 32 : index
%c0_1 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_2 = arith.constant 32 : index
%c0_3 = arith.constant 0 : index
%c1_4 = arith.constant 1 : index
%c32_5 = arith.constant 32 : index
%c32_6 = arith.constant 32 : index
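// NOTE: the segment below receives the launch induction variables and the
// three binding buffers as explicit block arguments (%arg7..%arg13); judging
// by this structure, launch/segment/herd bodies are isolated from above, so
// every external value must be threaded through `args(...)`.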
air.launch (%arg0, %arg1) in (%arg2=%c32_5, %arg3=%c32_6) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg2, %arg10=%arg3, %arg11=%arg4, %arg12=%arg5, %arg13=%arg6) : index, index, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%3 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg7)
%4 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg8)
%subview = memref.subview %arg11[%3, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_7 = memref.subview %arg12[0, %4] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_8 = memref.subview %arg13[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %subview[] [] []) : (memref<1x1x64x512xi32, 1 : i32>, memref<64x512xi32, strided<[512, 1], offset: ?>>)
%alloc_9 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_9[] [] [], %subview_7[] [] []) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x64xi32, strided<[2048, 1], offset: ?>>)
%alloc_10 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
%c2 = arith.constant 2 : index
%c0_11 = arith.constant 0 : index
%c1_12 = arith.constant 1 : index
%c2_13 = arith.constant 2 : index
%c0_14 = arith.constant 0 : index
%c1_15 = arith.constant 1 : index
%c2_16 = arith.constant 2 : index
%c2_17 = arith.constant 2 : index
air.herd @herd_0 tile (%arg14, %arg15) in (%arg16=%c2_16, %arg17=%c2_17) args(%arg18=%alloc_10, %arg19=%alloc, %arg20=%alloc_9) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32_19 = arith.constant 0 : i32
%c0_20 = arith.constant 0 : index
%c64_21 = arith.constant 64 : index
%c4_22 = arith.constant 4 : index
%5 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg14)
%6 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg15)
%subview_23 = memref.subview %arg18[0, 0, %5, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_24 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32_19 : i32) outs(%alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg21 = %c0_20 to %c64_21 step %c4_22 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg21]
%subview_26 = memref.subview %arg19[0, 0, %5, %7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_27 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_26 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_28 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_27[] [] [], %transpose_28[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>)
%subview_29 = memref.subview %arg20[0, 0, %7, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_30 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_31 = memref.expand_shape %subview_29 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_32 = memref.transpose %expand_shape_31 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
air.dma_memcpy_nd (%alloc_30[] [] [], %transpose_32[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_27, %alloc_30 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_33: i32, %out: i32):
%8 = arith.muli %in, %in_33 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_27 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_30 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_25 = memref.transpose %alloc_24 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
air.dma_memcpy_nd (%subview_23[] [] [], %transpose_25[] [] []) : (memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>, memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>)
memref.dealloc %alloc_24 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_18 = memref.subview %alloc_10[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_18 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
air.dma_memcpy_nd (%subview_8[] [] [], %transpose[] [] []) : (memref<64x64xi32, strided<[2048, 1], offset: ?>>, memref<64x64xi32, strided<[64, 1]>, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_9 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_10 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
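// After air-copy-to-dma (dump below): each air.dma_memcpy_nd now spells out its
// access pattern as explicit offset/size/stride operands, e.g. strides such as
// [32768, 32768, 8, 2048, 512, 1] for the packed 1x1x4x8x4x8 LHS tile, at the
// cost of many duplicated arith.constant ops that the canonicalizer is expected
// to fold.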
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg2, %arg10=%arg3, %arg11=%arg4, %arg12=%arg5, %arg13=%arg6) : index, index, index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%subview = memref.subview %arg11[%3, 0] [64, 512] [1, 1] : memref<2048x512xi32> to memref<64x512xi32, strided<[512, 1], offset: ?>>
%subview_0 = memref.subview %arg12[0, %4] [512, 64] [1, 1] : memref<512x2048xi32> to memref<512x64xi32, strided<[2048, 1], offset: ?>>
%subview_1 = memref.subview %arg13[%3, %4] [64, 64] [1, 1] : memref<2048x2048xi32> to memref<64x64xi32, strided<[2048, 1], offset: ?>>
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
%c0_2 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c512_3 = arith.constant 512 : index
air.dma_memcpy_nd (%alloc[] [] [], %arg11[%3, %c0_2] [%c64, %c512_3] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_4 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
%c0_5 = arith.constant 0 : index
%c2048 = arith.constant 2048 : index
%c1_6 = arith.constant 1 : index
%c512_7 = arith.constant 512 : index
%c64_8 = arith.constant 64 : index
air.dma_memcpy_nd (%alloc_4[] [] [], %arg12[%c0_5, %4] [%c512_7, %c64_8] [%c2048, %c1_6]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_9 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg14, %arg15) in (%arg16=%c2, %arg17=%c2) args(%arg18=%alloc_9, %arg19=%alloc, %arg20=%alloc_4) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg14]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg15]
%subview_28 = memref.subview %arg18[0, 0, %5, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[4096, 4096, 64, 1], offset: ?>, 1 : i32>
%alloc_29 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg21 = %c0_26 to %c64_27 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg21]
%subview_55 = memref.subview %arg19[0, 0, %5, %7] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x64x512xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32>
%alloc_56 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
%expand_shape = memref.expand_shape %subview_55 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 512, 1], offset: ?>, 1 : i32> into memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32>
%transpose_57 = memref.transpose %expand_shape (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x8x4x4x8xi32, strided<[32768, 32768, 2048, 512, 8, 1], offset: ?>, 1 : i32> to memref<1x1x4x8x4x8xi32, strided<[32768, 32768, 8, 2048, 512, 1], offset: ?>, 1 : i32>
%c0_58 = arith.constant 0 : index
%c0_59 = arith.constant 0 : index
%c32768 = arith.constant 32768 : index
%c32768_60 = arith.constant 32768 : index
%c8_61 = arith.constant 8 : index
%c2048_62 = arith.constant 2048 : index
%c512_63 = arith.constant 512 : index
%c1_64 = arith.constant 1 : index
%c1_65 = arith.constant 1 : index
%c1_66 = arith.constant 1 : index
%c4_67 = arith.constant 4 : index
%c8_68 = arith.constant 8 : index
%c4_69 = arith.constant 4 : index
%c8_70 = arith.constant 8 : index
%c0_71 = arith.constant 0 : index
%c0_72 = arith.constant 0 : index
air.dma_memcpy_nd (%alloc_56[] [] [], %arg19[%c0_72, %c0_71, %c0_58, %c0_59, %5, %7] [%c1_65, %c1_66, %c4_67, %c8_68, %c4_69, %c8_70] [%c32768, %c32768_60, %c8_61, %c2048_62, %c512_63, %c1_64]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%subview_73 = memref.subview %arg20[0, 0, %7, %6] [1, 1, 32, 32] [1, 1, 1, 1] : memref<1x1x512x64xi32, 1 : i32> to memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32>
%alloc_74 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
%expand_shape_75 = memref.expand_shape %subview_73 [[0], [1], [2, 3], [4, 5]] : memref<1x1x32x32xi32, strided<[32768, 32768, 64, 1], offset: ?>, 1 : i32> into memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32>
%transpose_76 = memref.transpose %expand_shape_75 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d2, d3, d5) : memref<1x1x4x8x8x4xi32, strided<[32768, 32768, 512, 64, 4, 1], offset: ?>, 1 : i32> to memref<1x1x8x4x8x4xi32, strided<[32768, 32768, 4, 512, 64, 1], offset: ?>, 1 : i32>
%c0_77 = arith.constant 0 : index
%c0_78 = arith.constant 0 : index
%c32768_79 = arith.constant 32768 : index
%c32768_80 = arith.constant 32768 : index
%c4_81 = arith.constant 4 : index
%c512_82 = arith.constant 512 : index
%c64_83 = arith.constant 64 : index
%c1_84 = arith.constant 1 : index
%c1_85 = arith.constant 1 : index
%c1_86 = arith.constant 1 : index
%c8_87 = arith.constant 8 : index
%c4_88 = arith.constant 4 : index
%c8_89 = arith.constant 8 : index
%c4_90 = arith.constant 4 : index
%c0_91 = arith.constant 0 : index
%c0_92 = arith.constant 0 : index
air.dma_memcpy_nd (%alloc_74[] [] [], %arg20[%c0_92, %c0_91, %c0_77, %c0_78, %7, %6] [%c1_85, %c1_86, %c8_87, %c4_88, %c8_89, %c4_90] [%c32768_79, %c32768_80, %c4_81, %c512_82, %c64_83, %c1_84]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_56, %alloc_74 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_93: i32, %out: i32):
%8 = arith.muli %in, %in_93 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_56 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_74 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
%transpose_30 = memref.transpose %alloc_29 (d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4, d2, d5) : memref<1x1x8x8x4x4xi32, 2 : i32> to memref<1x1x8x4x8x4xi32, strided<[1024, 1024, 16, 4, 128, 1]>, 2 : i32>
%c0_31 = arith.constant 0 : index
%c0_32 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1024_33 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%c4_34 = arith.constant 4 : index
%c128 = arith.constant 128 : index
%c1_35 = arith.constant 1 : index
%c1_36 = arith.constant 1 : index
%c1_37 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c4_38 = arith.constant 4 : index
%c8_39 = arith.constant 8 : index
%c4_40 = arith.constant 4 : index
%c4096_41 = arith.constant 4096 : index
%c4096_42 = arith.constant 4096 : index
%c64_43 = arith.constant 64 : index
%c1_44 = arith.constant 1 : index
%c1_45 = arith.constant 1 : index
%c1_46 = arith.constant 1 : index
%c32_47 = arith.constant 32 : index
%c32_48 = arith.constant 32 : index
%c0_49 = arith.constant 0 : index
%c0_50 = arith.constant 0 : index
%c0_51 = arith.constant 0 : index
%c0_52 = arith.constant 0 : index
%c0_53 = arith.constant 0 : index
%c0_54 = arith.constant 0 : index
air.dma_memcpy_nd (%arg18[%c0_31, %c0_32, %5, %6] [%c1_45, %c1_46, %c32_47, %c32_48] [%c4096_41, %c4096_42, %c64_43, %c1_44], %alloc_29[%c0_54, %c0_53, %c0_52, %c0_51, %c0_50, %c0_49] [%c1_36, %c1_37, %c8, %c4_38, %c8_39, %c4_40] [%c1024, %c1024_33, %c16, %c4_34, %c128, %c1_35]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_29 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
%subview_10 = memref.subview %alloc_9[0, 0, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<1x1x64x64xi32, 1 : i32> to memref<64x64xi32, 1 : i32>
%transpose = memref.transpose %subview_10 (d0, d1) -> (d0, d1) : memref<64x64xi32, 1 : i32> to memref<64x64xi32, strided<[64, 1]>, 1 : i32>
%c0_11 = arith.constant 0 : index
%c0_12 = arith.constant 0 : index
%c0_13 = arith.constant 0 : index
%c0_14 = arith.constant 0 : index
%c64_15 = arith.constant 64 : index
%c1_16 = arith.constant 1 : index
%c64_17 = arith.constant 64 : index
%c64_18 = arith.constant 64 : index
%c2048_19 = arith.constant 2048 : index
%c1_20 = arith.constant 1 : index
%c64_21 = arith.constant 64 : index
%c64_22 = arith.constant 64 : index
%c1_23 = arith.constant 1 : index
%c1_24 = arith.constant 1 : index
%c4096 = arith.constant 4096 : index
%c4096_25 = arith.constant 4096 : index
air.dma_memcpy_nd (%arg13[%3, %4] [%c64_21, %c64_22] [%c2048_19, %c1_20], %alloc_9[%c0_11, %c0_12, %c0_13, %c0_14] [%c1_24, %c1_23, %c64_17, %c64_18] [%c4096_25, %c4096, %c64_15, %c1_16]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_4 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_9 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before CSE (cse) //----- //
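// After canonicalization (dump below): duplicate arith.constant ops are folded
// to one per value per region, the dead memref.subview / expand_shape /
// transpose chains are gone, and the unused grid-size segment arguments have
// been dropped from the air.segment signature. Comparing against the next dump,
// the CSE run that follows appears to make no further changes here.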
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %arg9[%3, %c0_0] [%c64, %c512] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_1 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_1[] [] [], %arg10[%c0_0, %4] [%c512, %c64] [%c2048, %c1]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_2 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%alloc_2, %arg17=%alloc, %arg18=%alloc_1) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c32_3 = arith.constant 32 : index
%c4096_4 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_5 = arith.constant 1 : index
%c512_6 = arith.constant 512 : index
%c2048_7 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_8 = arith.constant 0 : index
%c64_9 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg19 = %c0_8 to %c64_9 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_11[] [] [], %arg17[%c0_8, %c0_8, %c0_8, %c0_8, %5, %7] [%c1_5, %c1_5, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_7, %c512_6, %c1_5]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %arg18[%c0_8, %c0_8, %c0_8, %c0_8, %7, %6] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_6, %c64_9, %c1_5]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%8 = arith.muli %in, %in_13 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
air.dma_memcpy_nd (%arg16[%c0_8, %c0_8, %5, %6] [%c1_5, %c1_5, %c32_3, %c32_3] [%c4096_4, %c4096_4, %c64_9, %c1_5], %alloc_10[%c0_8, %c0_8, %c0_8, %c0_8, %c0_8, %c0_8] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_5]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
air.dma_memcpy_nd (%arg11[%3, %4] [%c64, %c64] [%c2048, %c1], %alloc_2[%c0_0, %c0_0, %c0_0, %c0_0] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependency (air-dependency) //----- //
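// Input to air-dependency (dump below, unchanged by CSE). Judging by the next
// dump, this pass rewrites the IR into async form: side-effecting ops are
// wrapped in air.execute regions yielding !air.async.token values, the DMA,
// herd, launch and segment ops gain `async [...]` dependency lists, and the
// scf.for loop threads a token through iter_args (with air.wait_all joining
// tokens) so that per-iteration copies and compute stay ordered.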
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
memref.assume_alignment %0, 64 : memref<2048x512xi32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
memref.assume_alignment %1, 64 : memref<512x2048xi32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
memref.assume_alignment %2, 64 : memref<2048x2048xi32>
air.launch (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%0, %arg5=%1, %arg6=%2) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
air.segment @segment_0 args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
%4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.dma_memcpy_nd (%alloc[] [] [], %arg9[%3, %c0_0] [%c64, %c512] [%c512, %c1]) : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%alloc_1 = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.dma_memcpy_nd (%alloc_1[] [] [], %arg10[%c0_0, %4] [%c512, %c64] [%c2048, %c1]) : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%alloc_2 = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.herd @herd_0 tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%alloc_2, %arg17=%alloc, %arg18=%alloc_1) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> {
%c32_3 = arith.constant 32 : index
%c4096_4 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_5 = arith.constant 1 : index
%c512_6 = arith.constant 512 : index
%c2048_7 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_8 = arith.constant 0 : index
%c64_9 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
%alloc_10 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
linalg.fill ins(%c0_i32 : i32) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>)
scf.for %arg19 = %c0_8 to %c64_9 step %c4 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
%alloc_11 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_11[] [] [], %arg17[%c0_8, %c0_8, %c0_8, %c0_8, %5, %7] [%c1_5, %c1_5, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_7, %c512_6, %c1_5]) : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%alloc_12 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.dma_memcpy_nd (%alloc_12[] [] [], %arg18[%c0_8, %c0_8, %c0_8, %c0_8, %7, %6] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_6, %c64_9, %c1_5]) : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_11, %alloc_12 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_13: i32, %out: i32):
%8 = arith.muli %in, %in_13 : i32
%9 = arith.addi %out, %8 : i32
linalg.yield %9 : i32
}
memref.dealloc %alloc_11 : memref<1x1x4x8x4x8xi32, 2 : i32>
memref.dealloc %alloc_12 : memref<1x1x8x4x8x4xi32, 2 : i32>
}
air.dma_memcpy_nd (%arg16[%c0_8, %c0_8, %5, %6] [%c1_5, %c1_5, %c32_3, %c32_3] [%c4096_4, %c4096_4, %c64_9, %c1_5], %alloc_10[%c0_8, %c0_8, %c0_8, %c0_8, %c0_8, %c0_8] [%c1_5, %c1_5, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_5]) : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
memref.dealloc %alloc_10 : memref<1x1x8x8x4x4xi32, 2 : i32>
air.herd_terminator
}
air.dma_memcpy_nd (%arg11[%3, %4] [%c64, %c64] [%c2048, %c1], %alloc_2[%c0_0, %c0_0, %c0_0, %c0_0] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
memref.dealloc %alloc : memref<1x1x64x512xi32, 1 : i32>
memref.dealloc %alloc_1 : memref<1x1x512x64xi32, 1 : i32>
memref.dealloc %alloc_2 : memref<1x1x64x64xi32, 1 : i32>
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRDependencyScheduleOpt (air-dependency-schedule-opt) //----- //
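// Async IR after air-dependency (dump below): every alloc, affine.apply, fill
// and dealloc sits in an air.execute region tagged with an integer id, and the
// loop-carried token orders each K-step's two input DMAs before the
// linalg.generic. The schedule-opt pass that follows appears to analyze these
// DMAs for reuse: in the next dump the two L2-to-L1 input copies (ids 3 and 4)
// carry broadcast_pattern affine_set attributes.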
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%2, %3, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%c0_36 = arith.constant 0 : index
%c1_37 = arith.constant 1 : index
%c4_38 = arith.constant 4 : index
%c8_39 = arith.constant 8 : index
%c32768_40 = arith.constant 32768 : index
%c2048_41 = arith.constant 2048 : index
%c512_42 = arith.constant 512 : index
%c64_43 = arith.constant 64 : index
%async_token_44, %results_45 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_46, %results_47 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = air.dma_memcpy_nd async [%async_token_46, %async_token_44, %arg20] (%results_47[] [] [], %arg17[%c0_36, %c0_36, %c0_36, %c0_36, %results_29, %results_45] [%c1_37, %c1_37, %c4_38, %c8_39, %c4_38, %c8_39] [%c32768_40, %c32768_40, %c8_39, %c2048_41, %c512_42, %c1_37]) {id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%async_token_48, %results_49 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = air.dma_memcpy_nd async [%async_token_48, %async_token_44, %arg20] (%results_49[] [] [], %arg18[%c0_36, %c0_36, %c0_36, %c0_36, %results_45, %results_31] [%c1_37, %c1_37, %c8_39, %c4_38, %c8_39, %c4_38] [%c32768_40, %c32768_40, %c4_38, %c512_42, %c64_43, %c1_37]) {id = 4 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
%async_token_50 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_47, %results_49 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_53: i32, %out: i32):
%12 = arith.muli %in, %in_53 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_51 = air.execute [%async_token_50] {
memref.dealloc %results_47 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_52 = air.execute [%async_token_50] {
memref.dealloc %results_49 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_50] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 5 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 6 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before AIRSpecializeDmaBroadcast (air-specialize-dma-broadcast) //----- //
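// After air-dependency-schedule-opt (dump below): the two L2-to-L1 input DMAs
// now carry broadcast_pattern affine sets over the herd coordinates. For the
// LHS tile the set pins d0 (the herd row) and leaves d1 free, i.e. one copy can
// feed both herd columns; for the RHS tile it is the transpose, with one copy
// per column feeding both rows. The specialization pass below splits each
// tagged DMA into affine.if branches with concrete offsets and broadcast_set
// attributes.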
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%2, %3, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%async_token_36, %results_37 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_38, %results_39 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_26, %c0_26, %c0_26, %c0_26, %results_29, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_pattern = affine_set<(d0, d1)[s0] : (d0 - s0 == 0, d1 >= 0, -d1 + 1 >= 0, s0 >= 0, -s0 + 1 >= 0)>, id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
%async_token_40, %results_41 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_26, %c0_26, %c0_26, %c0_26, %results_37, %results_31] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_pattern = affine_set<(d0, d1)[s0] : (d0 >= 0, -d0 + 1 >= 0, d1 - s0 == 0, s0 >= 0, -s0 + 1 >= 0)>, id = 4 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
%async_token_42 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_39, %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_45: i32, %out: i32):
%12 = arith.muli %in, %in_45 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_43 = air.execute [%async_token_42] {
memref.dealloc %results_39 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_44 = air.execute [%async_token_42] {
memref.dealloc %results_41 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_42] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 5 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 6 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
// -----// IR Dump Before DmaToChannel (air-dma-to-channel) //----- //
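// After air-specialize-dma-broadcast (dump below): each broadcast DMA became an
// affine.if over [%arg12, %arg13]. The LHS copy (ids 3/4) branches on the herd
// row, reading the L2 tile at row offset 0 or 32 and sharing it across both
// columns; the RHS copy (ids 5/6) branches on the herd column, reading at
// column offset 0 or 32 and sharing it across both rows. air-dma-to-channel
// below is expected to lower every air.dma_memcpy_nd to air.channel put/get
// pairs.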
module {
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%1 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_8, %results_9 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %6 : index
} {id = 7 : i32}
%async_token_10, %results_11 = air.execute -> (index) {
%6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %6 : index
} {id = 8 : i32}
%async_token_12, %results_13 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%2 = air.dma_memcpy_nd async [%async_token_8, %async_token_12] (%results_13[] [] [], %arg9[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) {id = 1 : i32} : (memref<1x1x64x512xi32, 1 : i32>, memref<2048x512xi32>)
%async_token_14, %results_15 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%3 = air.dma_memcpy_nd async [%async_token_10, %async_token_14] (%results_15[] [] [], %arg10[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) {id = 2 : i32} : (memref<1x1x512x64xi32, 1 : i32>, memref<512x2048xi32>)
%async_token_16, %results_17 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%4 = air.herd @herd_0 async [%2, %3, %async_token_16] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_17, %arg17=%results_13, %arg18=%results_15) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_21 = arith.constant 32 : index
%c4096_22 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_23 = arith.constant 1 : index
%c512_24 = arith.constant 512 : index
%c2048_25 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_26 = arith.constant 0 : index
%c64_27 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_28, %results_29 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %9 : index
} {id = 12 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %9 : index
} {id = 13 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_34 = air.execute [%async_token_32] {
linalg.fill ins(%c0_i32 : i32) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%6 = air.wait_all async [%async_token_28, %async_token_30, %async_token_34] {id = 2 : i32}
%7 = scf.for %arg19 = %c0_26 to %c64_27 step %c4 iter_args(%arg20 = %6) -> (!air.async.token) {
%async_token_36, %results_37 = air.execute [%arg20] -> (index) {
%12 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %12 : index
} {id = 16 : i32}
%async_token_38, %results_39 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%9 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_45 = arith.constant 0 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_49, %c0_48, %c0_47, %c0_46, %c0_45, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>, id = 3 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
affine.yield %12 : !air.async.token
} else {
%c32_45 = arith.constant 32 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_38, %async_token_36, %arg20] (%results_39[] [] [], %arg17[%c0_49, %c0_48, %c0_47, %c0_46, %c32_45, %results_37] [%c1_23, %c1_23, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_25, %c512_24, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 - 1 == 0, s1 >= 0, -s1 + 1 >= 0)>, id = 4 : i32} : (memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x64x512xi32, 1 : i32>)
affine.yield %12 : !air.async.token
}
%async_token_40, %results_41 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%10 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_45 = arith.constant 0 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_49, %c0_48, %c0_47, %c0_46, %results_37, %c0_45] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>, id = 5 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
affine.yield %12 : !air.async.token
} else {
%c32_45 = arith.constant 32 : index
%c0_46 = arith.constant 0 : index
%c0_47 = arith.constant 0 : index
%c0_48 = arith.constant 0 : index
%c0_49 = arith.constant 0 : index
%12 = air.dma_memcpy_nd async [%async_token_40, %async_token_36, %arg20] (%results_41[] [] [], %arg18[%c0_49, %c0_48, %c0_47, %c0_46, %results_37, %c32_45] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_24, %c64_27, %c1_23]) {broadcast_set = affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 - 1 == 0)>, id = 6 : i32} : (memref<1x1x8x4x8x4xi32, 2 : i32>, memref<1x1x512x64xi32, 1 : i32>)
affine.yield %12 : !air.async.token
}
%async_token_42 = air.execute [%10, %9, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_39, %results_41 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_45: i32, %out: i32):
%12 = arith.muli %in, %in_45 : i32
%13 = arith.addi %out, %12 : i32
linalg.yield %13 : i32
}
} {id = 19 : i32}
%async_token_43 = air.execute [%async_token_42] {
memref.dealloc %results_39 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_44 = air.execute [%async_token_42] {
memref.dealloc %results_41 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%11 = air.wait_all async [%arg20, %async_token_42] {id = 1 : i32}
scf.yield %11 : !air.async.token
}
%8 = air.dma_memcpy_nd async [%7] (%arg16[%c0_26, %c0_26, %results_29, %results_31] [%c1_23, %c1_23, %c32_21, %c32_21] [%c4096_22, %c4096_22, %c64_27, %c1_23], %results_33[%c0_26, %c0_26, %c0_26, %c0_26, %c0_26, %c0_26] [%c1_23, %c1_23, %c8, %c4, %c8, %c4] [%c1024, %c1024, %c16, %c4, %c128, %c1_23]) {id = 7 : i32} : (memref<1x1x64x64xi32, 1 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_35 = air.execute [%8] {
memref.dealloc %results_33 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%5 = air.dma_memcpy_nd async [%4] (%arg11[%results_9, %results_11] [%c64, %c64] [%c2048, %c1], %results_17[%c0_7, %c0_7, %c0_7, %c0_7] [%c1, %c1, %c64, %c64] [%c4096, %c4096, %c64, %c1]) {id = 8 : i32} : (memref<2048x2048xi32>, memref<1x1x64x64xi32, 1 : i32>)
%async_token_18 = air.execute [%4] {
memref.dealloc %results_13 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_19 = air.execute [%4] {
memref.dealloc %results_15 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_20 = air.execute [%5] {
memref.dealloc %results_17 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
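// Note: the 9-D linalg.generic in the herd body above is the packed matmul
// micro-kernel. Reading off its indexing maps, it computes (scalar sketch):
//   C[d0,d1,d4,d3,d6,d7] += A[d0,d2,d5,d3,d6,d8] * B[d2,d1,d4,d5,d8,d7]
// with d2, d5 and d8 as the reduction dimensions, accumulating into the
// 1x1x8x8x4x4 output tile in local (level-2) memory.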
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%c2048 = arith.constant 2048 : index
%c64_10 = arith.constant 64 : index
%c1_11 = arith.constant 1 : index
%c512_12 = arith.constant 512 : index
%c0_13 = arith.constant 0 : index
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_14] @channel_5[] (%arg5[%c0_13, %results_15] [%c512_12, %c64_10] [%c2048, %c1_11]) : (memref<512x2048xi32>)
%c2048_16 = arith.constant 2048 : index
%c64_17 = arith.constant 64 : index
%c1_18 = arith.constant 1 : index
%async_token_19, %results_20 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_21, %results_22 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_19, %async_token_21] @channel_7[] (%arg6[%results_20, %results_22] [%c64_17, %c64_17] [%c2048_16, %c1_18]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1, %arg9=%arg4, %arg10=%arg5, %arg11=%arg6) : index, index, memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 2 : i32} {
%c4096 = arith.constant 4096 : index
%c2048_23 = arith.constant 2048 : index
%c64_24 = arith.constant 64 : index
%c1_25 = arith.constant 1 : index
%c512_26 = arith.constant 512 : index
%c0_27 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_28, %results_29 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %23 : index
} {id = 7 : i32}
%async_token_30, %results_31 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %23 : index
} {id = 8 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_28, %async_token_32] @channel_4[] (%results_33[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_34, %results_35 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_30, %async_token_34] @channel_5[] (%results_35[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_36, %results_37 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%c0_38 = arith.constant 0 : index
%c1_39 = arith.constant 1 : index
%c512_40 = arith.constant 512 : index
%c2048_41 = arith.constant 2048 : index
%c8 = arith.constant 8 : index
%c32768 = arith.constant 32768 : index
%c0_42 = arith.constant 0 : index
%c64_43 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%async_token_44, %results_45 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_46, %results_47 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_48, %results_49 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = air.wait_all async
%8 = air.wait_all async [%async_token_32, %5, %async_token_44, %async_token_46, %7] {id = 2 : i32}
%9 = scf.for %arg12 = %c0_42 to %c64_43 step %c4 iter_args(%arg13 = %8) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c4_100 = arith.constant 4 : index
%c8_101 = arith.constant 8 : index
%c32768_102 = arith.constant 32768 : index
%c2048_103 = arith.constant 2048 : index
%c512_104 = arith.constant 512 : index
%c64_105 = arith.constant 64 : index
%async_token_106, %results_107 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_106, %async_token_32, %arg13] @channel_0[] (%results_33[%c0_112, %c0_111, %c0_110, %c0_109, %c0_108, %results_107] [%c1_39, %c1_39, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_41, %c512_40, %c1_39]) : (memref<1x1x64x512xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_50 = arith.constant 1 : index
%c512_51 = arith.constant 512 : index
%c2048_52 = arith.constant 2048 : index
%c8_53 = arith.constant 8 : index
%c32768_54 = arith.constant 32768 : index
%c0_55 = arith.constant 0 : index
%c64_56 = arith.constant 64 : index
%c4_57 = arith.constant 4 : index
%async_token_58, %results_59 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_60, %results_61 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_38]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_62, %results_63 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%10 = air.wait_all async
%11 = air.wait_all async [%async_token_32, %5, %async_token_58, %async_token_60, %10] {id = 2 : i32}
%12 = scf.for %arg12 = %c0_55 to %c64_56 step %c4_57 iter_args(%arg13 = %11) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c4_100 = arith.constant 4 : index
%c8_101 = arith.constant 8 : index
%c32768_102 = arith.constant 32768 : index
%c2048_103 = arith.constant 2048 : index
%c512_104 = arith.constant 512 : index
%c64_105 = arith.constant 64 : index
%async_token_106, %results_107 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c32_108 = arith.constant 32 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%c0_112 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_106, %async_token_32, %arg13] @channel_1[] (%results_33[%c0_112, %c0_111, %c0_110, %c0_109, %c32_108, %results_107] [%c1_50, %c1_50, %c4_57, %c8_53, %c4_57, %c8_53] [%c32768_54, %c32768_54, %c8_53, %c2048_52, %c512_51, %c1_50]) : (memref<1x1x64x512xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c0_64 = arith.constant 0 : index
%c1_65 = arith.constant 1 : index
%c512_66 = arith.constant 512 : index
%c8_67 = arith.constant 8 : index
%c32768_68 = arith.constant 32768 : index
%c0_69 = arith.constant 0 : index
%c64_70 = arith.constant 64 : index
%c4_71 = arith.constant 4 : index
%async_token_72, %results_73 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_74, %results_75 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_76, %results_77 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%13 = air.wait_all async
%14 = air.wait_all async [%async_token_34, %6, %async_token_72, %async_token_74, %13] {id = 2 : i32}
%15 = scf.for %arg12 = %c0_69 to %c64_70 step %c4_71 iter_args(%arg13 = %14) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c8_100 = arith.constant 8 : index
%c4_101 = arith.constant 4 : index
%c32768_102 = arith.constant 32768 : index
%c512_103 = arith.constant 512 : index
%c64_104 = arith.constant 64 : index
%async_token_105, %results_106 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c0_107 = arith.constant 0 : index
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_105, %async_token_34, %arg13] @channel_2[] (%results_35[%c0_111, %c0_110, %c0_109, %c0_108, %results_106, %c0_107] [%c1_65, %c1_65, %c8_67, %c4_71, %c8_67, %c4_71] [%c32768_68, %c32768_68, %c4_71, %c512_66, %c64_70, %c1_65]) : (memref<1x1x512x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_78 = arith.constant 1 : index
%c512_79 = arith.constant 512 : index
%c8_80 = arith.constant 8 : index
%c32768_81 = arith.constant 32768 : index
%c0_82 = arith.constant 0 : index
%c64_83 = arith.constant 64 : index
%c4_84 = arith.constant 4 : index
%async_token_85, %results_86 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 12 : i32}
%async_token_87, %results_88 = air.execute -> (index) {
%23 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%c0_64]
air.execute_terminator %23 : index
} {id = 13 : i32}
%async_token_89, %results_90 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%16 = air.wait_all async
%17 = air.wait_all async [%async_token_34, %6, %async_token_85, %async_token_87, %16] {id = 2 : i32}
%18 = scf.for %arg12 = %c0_82 to %c64_83 step %c4_84 iter_args(%arg13 = %17) -> (!air.async.token) {
%c1_99 = arith.constant 1 : index
%c8_100 = arith.constant 8 : index
%c4_101 = arith.constant 4 : index
%c32768_102 = arith.constant 32768 : index
%c512_103 = arith.constant 512 : index
%c64_104 = arith.constant 64 : index
%async_token_105, %results_106 = air.execute [%arg13] -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg12]
air.execute_terminator %25 : index
} {id = 16 : i32}
%c32_107 = arith.constant 32 : index
%c0_108 = arith.constant 0 : index
%c0_109 = arith.constant 0 : index
%c0_110 = arith.constant 0 : index
%c0_111 = arith.constant 0 : index
%23 = air.channel.put async [%async_token_105, %async_token_34, %arg13] @channel_3[] (%results_35[%c0_111, %c0_110, %c0_109, %c0_108, %results_106, %c32_107] [%c1_78, %c1_78, %c8_80, %c4_84, %c8_80, %c4_84] [%c32768_81, %c32768_81, %c4_84, %c512_79, %c64_83, %c1_78]) : (memref<1x1x512x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.yield %24 : !air.async.token
}
%c1_91 = arith.constant 1 : index
%c0_92 = arith.constant 0 : index
%c0_93 = arith.constant 0 : index
%c2_94 = arith.constant 2 : index
%c2_95 = arith.constant 2 : index
%19 = air.wait_all async [%async_token_36]
%20 = scf.parallel (%arg12, %arg13) = (%c0_92, %c0_93) to (%c2_94, %c2_95) step (%c1_91, %c1_91) init (%19) -> !air.async.token {
%c32_99 = arith.constant 32 : index
%c4096_100 = arith.constant 4096 : index
%c1_101 = arith.constant 1 : index
%c0_102 = arith.constant 0 : index
%c64_103 = arith.constant 64 : index
%async_token_104, %results_105 = air.execute -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %25 : index
} {id = 12 : i32}
%async_token_106, %results_107 = air.execute -> (index) {
%25 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %25 : index
} {id = 13 : i32}
%23 = air.channel.get async [%async_token_106, %async_token_104, %async_token_36, %19] @channel_6[%arg12, %arg13] (%results_37[%c0_102, %c0_102, %results_105, %results_107] [%c1_101, %c1_101, %c32_99, %c32_99] [%c4096_100, %c4096_100, %c64_103, %c1_101]) : (memref<1x1x64x64xi32, 1 : i32>)
%24 = air.wait_all async [%23]
scf.reduce(%24 : !air.async.token) {
^bb0(%arg14: !air.async.token, %arg15: !air.async.token):
%25 = air.wait_all async [%arg14, %arg15]
scf.reduce.return %25 : !air.async.token
}
}
%21 = air.herd @herd_0 async [%5, %6, %async_token_36] tile (%arg12, %arg13) in (%arg14=%c2, %arg15=%c2) args(%arg16=%results_37, %arg17=%results_33, %arg18=%results_35) : memref<1x1x64x64xi32, 1 : i32>, memref<1x1x64x512xi32, 1 : i32>, memref<1x1x512x64xi32, 1 : i32> attributes {id = 1 : i32} {
%c32_99 = arith.constant 32 : index
%c4096_100 = arith.constant 4096 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_101 = arith.constant 1 : index
%c512_102 = arith.constant 512 : index
%c2048_103 = arith.constant 2048 : index
%c8_104 = arith.constant 8 : index
%c32768_105 = arith.constant 32768 : index
%c0_i32 = arith.constant 0 : i32
%c0_106 = arith.constant 0 : index
%c64_107 = arith.constant 64 : index
%c4_108 = arith.constant 4 : index
%async_token_109, %results_110 = air.execute -> (index) {
%26 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg12]
air.execute_terminator %26 : index
} {id = 12 : i32}
%async_token_111, %results_112 = air.execute -> (index) {
%26 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg13]
air.execute_terminator %26 : index
} {id = 13 : i32}
%async_token_113, %results_114 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_115 = air.execute [%async_token_113] {
linalg.fill ins(%c0_i32 : i32) outs(%results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%23 = air.wait_all async [%async_token_109, %async_token_111, %async_token_115] {id = 2 : i32}
%24 = scf.for %arg19 = %c0_106 to %c64_107 step %c4_108 iter_args(%arg20 = %23) -> (!air.async.token) {
%async_token_117, %results_118 = air.execute [%arg20] -> (index) {
%29 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg19]
air.execute_terminator %29 : index
} {id = 16 : i32}
%async_token_119, %results_120 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%26 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_119, %async_token_117, %arg20] @channel_0[%arg12, %arg13] (%results_120[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %29 : !air.async.token
} else {
%c32_126 = arith.constant 32 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_119, %async_token_117, %arg20] @channel_1[%arg12, %arg13] (%results_120[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %29 : !air.async.token
}
%async_token_121, %results_122 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%27 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg12, %arg13] -> !air.async.token {
%c0_126 = arith.constant 0 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_121, %async_token_117, %arg20] @channel_2[%arg12, %arg13] (%results_122[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %29 : !air.async.token
} else {
%c32_126 = arith.constant 32 : index
%c0_127 = arith.constant 0 : index
%c0_128 = arith.constant 0 : index
%c0_129 = arith.constant 0 : index
%c0_130 = arith.constant 0 : index
%29 = air.channel.get async [%async_token_121, %async_token_117, %arg20] @channel_3[%arg12, %arg13] (%results_122[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %29 : !air.async.token
}
%async_token_123 = air.execute [%27, %26, %arg20] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_120, %results_122 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_126: i32, %out: i32):
%29 = arith.muli %in, %in_126 : i32
%30 = arith.addi %out, %29 : i32
linalg.yield %30 : i32
}
} {id = 19 : i32}
%async_token_124 = air.execute [%async_token_123] {
memref.dealloc %results_120 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_125 = air.execute [%async_token_123] {
memref.dealloc %results_122 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%28 = air.wait_all async [%arg20, %async_token_123] {id = 1 : i32}
scf.yield %28 : !air.async.token
}
%25 = air.channel.put async [%24] @channel_6[%arg12, %arg13] (%results_114[%c0_106, %c0_106, %c0_106, %c0_106, %c0_106, %c0_106] [%c1_101, %c1_101, %c8_104, %c4_108, %c8_104, %c4_108] [%c1024, %c1024, %c16, %c4_108, %c128, %c1_101]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_116 = air.execute [%25] {
memref.dealloc %results_114 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%22 = air.channel.put async [%21] @channel_7[] (%results_37[%c0_27, %c0_27, %c0_27, %c0_27] [%c1_25, %c1_25, %c64_24, %c64_24] [%c4096, %c4096, %c64_24, %c1_25]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_96 = air.execute [%21] {
memref.dealloc %results_33 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_97 = air.execute [%21] {
memref.dealloc %results_35 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_98 = air.execute [%22] {
memref.dealloc %results_37 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
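// Note: comparing this dump with the previous one, the canonicalizer appears
// to have folded the dead shadowed constants inside the affine.if regions
// (the unused %c0_* / %c32_* copies), dropped the unused memref operands from
// air.segment @segment_0, and erased the no-operand air.wait_all tokens that
// only forwarded other tokens.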
// -----// IR Dump Before CSE (cse) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_22, %results_23 = air.execute -> (index) {
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %18 : index
} {id = 7 : i32}
%async_token_24, %results_25 = air.execute -> (index) {
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %18 : index
} {id = 8 : i32}
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_22, %async_token_26] @channel_4[] (%results_27[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_28, %results_29 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_24, %async_token_28] @channel_5[] (%results_29[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_30, %results_31 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = air.wait_all async [%async_token_26, %5]
%8 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %7) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_0[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_34, %results_35 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%9 = air.wait_all async [%async_token_26, %5]
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %9) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_1[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%11 = air.wait_all async [%async_token_28, %6]
%12 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %11) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_2[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_38, %results_39 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%13 = air.wait_all async [%async_token_28, %6]
%14 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %13) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_3[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%15 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_30) -> !air.async.token {
%async_token_43, %results_44 = air.execute -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %19 : index
} {id = 12 : i32}
%async_token_45, %results_46 = air.execute -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %19 : index
} {id = 13 : i32}
%18 = air.channel.get async [%async_token_45, %async_token_43, %async_token_30, %async_token_30] @channel_6[%arg9, %arg10] (%results_31[%c0_21, %c0_21, %results_44, %results_46] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%18 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%19 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %19 : !air.async.token
}
}
%16 = air.herd @herd_0 async [%5, %6, %async_token_30] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 1 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_43 = arith.constant 1 : index
%c8_44 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_45 = arith.constant 0 : index
%c64_46 = arith.constant 64 : index
%c4_47 = arith.constant 4 : index
%async_token_48, %results_49 = air.execute -> (index) {
%21 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %21 : index
} {id = 12 : i32}
%async_token_50, %results_51 = air.execute -> (index) {
%21 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %21 : index
} {id = 13 : i32}
%async_token_52, %results_53 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_54 = air.execute [%async_token_52] {
linalg.fill ins(%c0_i32 : i32) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%18 = air.wait_all async [%async_token_48, %async_token_50, %async_token_54] {id = 2 : i32}
%19 = scf.for %arg13 = %c0_45 to %c64_46 step %c4_47 iter_args(%arg14 = %18) -> (!air.async.token) {
%async_token_56, %results_57 = air.execute [%arg14] -> (index) {
%24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg13]
air.execute_terminator %24 : index
} {id = 16 : i32}
%async_token_58, %results_59 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%21 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%24 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_0[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %24 : !air.async.token
} else {
%24 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_1[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %24 : !air.async.token
}
%async_token_60, %results_61 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%22 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%24 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_2[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %24 : !air.async.token
} else {
%24 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_3[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %24 : !air.async.token
}
%async_token_62 = air.execute [%22, %21, %arg14] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_59, %results_61 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%24 = arith.muli %in, %in_65 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
}
} {id = 19 : i32}
%async_token_63 = air.execute [%async_token_62] {
memref.dealloc %results_59 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_64 = air.execute [%async_token_62] {
memref.dealloc %results_61 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%23 = air.wait_all async [%arg14, %async_token_62] {id = 1 : i32}
scf.yield %23 : !air.async.token
}
%20 = air.channel.put async [%19] @channel_6[%arg9, %arg10] (%results_53[%c0_45, %c0_45, %c0_45, %c0_45, %c0_45, %c0_45] [%c1_43, %c1_43, %c8_44, %c4_47, %c8_44, %c4_47] [%c1024, %c1024, %c16, %c4_47, %c128, %c1_43]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_55 = air.execute [%20] {
memref.dealloc %results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%17 = air.channel.put async [%16] @channel_7[] (%results_31[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_40 = air.execute [%16] {
memref.dealloc %results_27 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_41 = air.execute [%16] {
memref.dealloc %results_29 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_42 = air.execute [%17] {
memref.dealloc %results_31 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
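// Note: this dump appears identical to the previous one; after
// canonicalization no textual common subexpressions remain for CSE to merge.
// The duplicated index computations (e.g. %results_9 vs %results_13, both
// applying s0 * 64 to %arg0) survive, presumably because each air.execute
// carries its own async token and is not treated as pure.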
// -----// IR Dump Before AIRDependencyCanonicalize (air-dependency-canonicalize) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
} {id = 1 : i32}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
} {id = 2 : i32}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
} {id = 3 : i32}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
} {id = 4 : i32}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>
air.execute_terminator %1 : memref<2048x2048xi32>
} {id = 5 : i32}
%async_token_6 = air.execute [%async_token_4] {
memref.assume_alignment %results_5, 64 : memref<2048x2048xi32>
} {id = 6 : i32}
%0 = air.launch async [%async_token_0, %async_token_3, %async_token_6] (%arg0, %arg1) in (%arg2=%c32, %arg3=%c32) args(%arg4=%results, %arg5=%results_2, %arg6=%results_5) : memref<2048x512xi32>, memref<512x2048xi32>, memref<2048x2048xi32> attributes {id = 3 : i32} {
%c2048 = arith.constant 2048 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c512 = arith.constant 512 : index
%c0_7 = arith.constant 0 : index
%async_token_8, %results_9 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%1 = air.channel.put async [%async_token_8] @channel_4[] (%arg4[%results_9, %c0_7] [%c64, %c512] [%c512, %c1]) : (memref<2048x512xi32>)
%async_token_10, %results_11 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%2 = air.channel.put async [%async_token_10] @channel_5[] (%arg5[%c0_7, %results_11] [%c512, %c64] [%c2048, %c1]) : (memref<512x2048xi32>)
%async_token_12, %results_13 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg0]
air.execute_terminator %5 : index
} {id = 7 : i32}
%async_token_14, %results_15 = air.execute -> (index) {
%5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg1]
air.execute_terminator %5 : index
} {id = 8 : i32}
%3 = air.channel.get async [%async_token_12, %async_token_14] @channel_7[] (%arg6[%results_13, %results_15] [%c64, %c64] [%c2048, %c1]) : (memref<2048x2048xi32>)
%4 = air.segment @segment_0 async args(%arg7=%arg0, %arg8=%arg1) : index, index attributes {id = 2 : i32} {
%c32_16 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c32768 = arith.constant 32768 : index
%c8 = arith.constant 8 : index
%c4096 = arith.constant 4096 : index
%c2048_17 = arith.constant 2048 : index
%c64_18 = arith.constant 64 : index
%c1_19 = arith.constant 1 : index
%c512_20 = arith.constant 512 : index
%c0_21 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%async_token_22, %results_23 = air.execute -> (index) {
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg7]
air.execute_terminator %18 : index
} {id = 7 : i32}
%async_token_24, %results_25 = air.execute -> (index) {
%18 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%arg8]
air.execute_terminator %18 : index
} {id = 8 : i32}
%async_token_26, %results_27 = air.execute -> (memref<1x1x64x512xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x512xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x512xi32, 1 : i32>
} {id = 9 : i32}
%5 = air.channel.get async [%async_token_22, %async_token_26] @channel_4[] (%results_27[] [] []) : (memref<1x1x64x512xi32, 1 : i32>)
%async_token_28, %results_29 = air.execute -> (memref<1x1x512x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x512x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x512x64xi32, 1 : i32>
} {id = 10 : i32}
%6 = air.channel.get async [%async_token_24, %async_token_28] @channel_5[] (%results_29[] [] []) : (memref<1x1x512x64xi32, 1 : i32>)
%async_token_30, %results_31 = air.execute -> (memref<1x1x64x64xi32, 1 : i32>) {
%alloc = memref.alloc() : memref<1x1x64x64xi32, 1 : i32>
air.execute_terminator %alloc : memref<1x1x64x64xi32, 1 : i32>
} {id = 11 : i32}
%async_token_32, %results_33 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%7 = air.wait_all async [%async_token_26, %5]
%8 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %7) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_0[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c0_21, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_34, %results_35 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%9 = air.wait_all async [%async_token_26, %5]
%10 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %9) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_26, %arg10] @channel_1[] (%results_27[%c0_21, %c0_21, %c0_21, %c0_21, %c32_16, %results_44] [%c1_19, %c1_19, %c4, %c8, %c4, %c8] [%c32768, %c32768, %c8, %c2048_17, %c512_20, %c1_19]) : (memref<1x1x64x512xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_36, %results_37 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%11 = air.wait_all async [%async_token_28, %6]
%12 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %11) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_2[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c0_21] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%async_token_38, %results_39 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%13 = air.wait_all async [%async_token_28, %6]
%14 = scf.for %arg9 = %c0_21 to %c64_18 step %c4 iter_args(%arg10 = %13) -> (!air.async.token) {
%async_token_43, %results_44 = air.execute [%arg10] -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg9]
air.execute_terminator %19 : index
} {id = 16 : i32}
%18 = air.channel.put async [%async_token_43, %async_token_28, %arg10] @channel_3[] (%results_29[%c0_21, %c0_21, %c0_21, %c0_21, %results_44, %c32_16] [%c1_19, %c1_19, %c8, %c4, %c8, %c4] [%c32768, %c32768, %c4, %c512_20, %c64_18, %c1_19]) : (memref<1x1x512x64xi32, 1 : i32>)
scf.yield %18 : !air.async.token
}
%15 = scf.parallel (%arg9, %arg10) = (%c0_21, %c0_21) to (%c2, %c2) step (%c1_19, %c1_19) init (%async_token_30) -> !air.async.token {
%async_token_43, %results_44 = air.execute -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %19 : index
} {id = 12 : i32}
%async_token_45, %results_46 = air.execute -> (index) {
%19 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %19 : index
} {id = 13 : i32}
%18 = air.channel.get async [%async_token_45, %async_token_43, %async_token_30, %async_token_30] @channel_6[%arg9, %arg10] (%results_31[%c0_21, %c0_21, %results_44, %results_46] [%c1_19, %c1_19, %c32_16, %c32_16] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
scf.reduce(%18 : !air.async.token) {
^bb0(%arg11: !air.async.token, %arg12: !air.async.token):
%19 = air.wait_all async [%arg11, %arg12]
scf.reduce.return %19 : !air.async.token
}
}
%16 = air.herd @herd_0 async [%5, %6, %async_token_30] tile (%arg9, %arg10) in (%arg11=%c2, %arg12=%c2) attributes {id = 1 : i32} {
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c1_43 = arith.constant 1 : index
%c8_44 = arith.constant 8 : index
%c0_i32 = arith.constant 0 : i32
%c0_45 = arith.constant 0 : index
%c64_46 = arith.constant 64 : index
%c4_47 = arith.constant 4 : index
%async_token_48, %results_49 = air.execute -> (index) {
%21 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg9]
air.execute_terminator %21 : index
} {id = 12 : i32}
%async_token_50, %results_51 = air.execute -> (index) {
%21 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%arg10]
air.execute_terminator %21 : index
} {id = 13 : i32}
%async_token_52, %results_53 = air.execute -> (memref<1x1x8x8x4x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 14 : i32}
%async_token_54 = air.execute [%async_token_52] {
linalg.fill ins(%c0_i32 : i32) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>)
} {id = 15 : i32}
%18 = air.wait_all async [%async_token_48, %async_token_50, %async_token_54] {id = 2 : i32}
%19 = scf.for %arg13 = %c0_45 to %c64_46 step %c4_47 iter_args(%arg14 = %18) -> (!air.async.token) {
%async_token_56, %results_57 = air.execute [%arg14] -> (index) {
%24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%arg13]
air.execute_terminator %24 : index
} {id = 16 : i32}
%async_token_58, %results_59 = air.execute -> (memref<1x1x4x8x4x8xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 17 : i32}
%21 = affine.if affine_set<()[s0, s1] : (s0 == 0, s1 >= 0, -s1 + 1 >= 0)>()[%arg9, %arg10] -> !air.async.token {
%24 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_0[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %24 : !air.async.token
} else {
%24 = air.channel.get async [%async_token_58, %async_token_56, %arg14] @channel_1[%arg9, %arg10] (%results_59[] [] []) : (memref<1x1x4x8x4x8xi32, 2 : i32>)
affine.yield %24 : !air.async.token
}
%async_token_60, %results_61 = air.execute -> (memref<1x1x8x4x8x4xi32, 2 : i32>) {
%alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
air.execute_terminator %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 18 : i32}
%22 = affine.if affine_set<()[s0, s1] : (s0 >= 0, -s0 + 1 >= 0, s1 == 0)>()[%arg9, %arg10] -> !air.async.token {
%24 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_2[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %24 : !air.async.token
} else {
%24 = air.channel.get async [%async_token_60, %async_token_56, %arg14] @channel_3[%arg9, %arg10] (%results_61[] [] []) : (memref<1x1x8x4x8x4xi32, 2 : i32>)
affine.yield %24 : !air.async.token
}
%async_token_62 = air.execute [%22, %21, %arg14] {
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%results_59, %results_61 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [0, 0, 0, 32, 32], [0, 0, 0, 0, 0, 4]]>, packing_config = #amdaie.packing_config<packing_config = [{packedSizes = [64, 64, 512], transposePackIndices = [1], unpackEmpty = [false], innerPerm = [[1, 0]], outerPerm = [[0, 1]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>} {
^bb0(%in: i32, %in_65: i32, %out: i32):
%24 = arith.muli %in, %in_65 : i32
%25 = arith.addi %out, %24 : i32
linalg.yield %25 : i32
}
} {id = 19 : i32}
%async_token_63 = air.execute [%async_token_62] {
memref.dealloc %results_59 : memref<1x1x4x8x4x8xi32, 2 : i32>
} {id = 20 : i32}
%async_token_64 = air.execute [%async_token_62] {
memref.dealloc %results_61 : memref<1x1x8x4x8x4xi32, 2 : i32>
} {id = 21 : i32}
%23 = air.wait_all async [%arg14, %async_token_62] {id = 1 : i32}
scf.yield %23 : !air.async.token
}
%20 = air.channel.put async [%19] @channel_6[%arg9, %arg10] (%results_53[%c0_45, %c0_45, %c0_45, %c0_45, %c0_45, %c0_45] [%c1_43, %c1_43, %c8_44, %c4_47, %c8_44, %c4_47] [%c1024, %c1024, %c16, %c4_47, %c128, %c1_43]) : (memref<1x1x8x8x4x4xi32, 2 : i32>)
%async_token_55 = air.execute [%20] {
memref.dealloc %results_53 : memref<1x1x8x8x4x4xi32, 2 : i32>
} {id = 22 : i32}
air.herd_terminator
}
%17 = air.channel.put async [%16] @channel_7[] (%results_31[%c0_21, %c0_21, %c0_21, %c0_21] [%c1_19, %c1_19, %c64_18, %c64_18] [%c4096, %c4096, %c64_18, %c1_19]) : (memref<1x1x64x64xi32, 1 : i32>)
%async_token_40 = air.execute [%16] {
memref.dealloc %results_27 : memref<1x1x64x512xi32, 1 : i32>
} {id = 23 : i32}
%async_token_41 = air.execute [%16] {
memref.dealloc %results_29 : memref<1x1x512x64xi32, 1 : i32>
} {id = 24 : i32}
%async_token_42 = air.execute [%17] {
memref.dealloc %results_31 : memref<1x1x64x64xi32, 1 : i32>
} {id = 25 : i32}
air.segment_terminator
}
air.launch_terminator
}
return
}
}
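// Note: in the dump below (after air-dependency-canonicalize) the
// {id = ... : i32} bookkeeping attributes have been stripped from the
// air.execute ops; since the pass canonicalizes the async dependency graph,
// redundant dependency edges (e.g. the repeated %async_token_30 operand on
// the air.channel.get above) would be expected to be pruned as well.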
// -----// IR Dump Before Canonicalizer (canonicalize) //----- //
module {
air.channel @channel_7 [1, 1]
air.channel @channel_6 [2, 2]
air.channel @channel_5 [1, 1]
air.channel @channel_4 [1, 1]
air.channel @channel_3 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_2 [1, 1] {broadcast_shape = [2, 1]}
air.channel @channel_1 [1, 1] {broadcast_shape = [1, 2]}
air.channel @channel_0 [1, 1] {broadcast_shape = [1, 2]}
func.func @matmul_large_dispatch_0_matmul_2048x2048x512_i32() {
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%async_token, %results = air.execute -> (memref<2048x512xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x512xi32>
air.execute_terminator %1 : memref<2048x512xi32>
}
%async_token_0 = air.execute [%async_token] {
memref.assume_alignment %results, 64 : memref<2048x512xi32>
}
%async_token_1, %results_2 = air.execute -> (memref<512x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x2048xi32>
air.execute_terminator %1 : memref<512x2048xi32>
}
%async_token_3 = air.execute [%async_token_1] {
memref.assume_alignment %results_2, 64 : memref<512x2048xi32>
}
%async_token_4, %results_5 = air.execute -> (memref<2048x2048xi32>) {
%1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x2048xi32>