Skip to content

Instantly share code, notes, and snippets.

@jtuyls
Last active March 5, 2024 17:05
Show Gist options
  • Save jtuyls/7e6a41619666fa3186b1a8156978eedc to your computer and use it in GitHub Desktop.
Save jtuyls/7e6a41619666fa3186b1a8156978eedc to your computer and use it in GitHub Desktop.
Peeled matmul to AIE code
// Affine maps used by this module.
//   #map  : forall index -> element offset, in steps of 32.
//   #map1 : forall index -> element offset, in steps of 64.
//   #map2 / #map3 / #map4 : indexing maps of the packed matmul (first input,
//   second input, accumulator). Matching them against the operand shapes of
//   the linalg.generic ops below, d0/d1/d2 index the outer M/N/K tiles and
//   d3/d4/d5 the inner m/n/k elements; d2 and d5 are the reduction dims.
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
// Bufferized i32 matmul: (32x1024) x (1024x64) -> 32x64.
// The K dimension is consumed in chunks of 64. The first chunk (offset 0) is
// handled ahead of the scf.for ("peeled"); the loop covers offsets 64..1024.
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
%c64 = arith.constant 64 : index
%c1024 = arith.constant 1024 : index
%c0_i32 = arith.constant 0 : i32
// %0 views the second operand (1024x64), %1 the first operand (32x1024).
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
// Scratch buffers. The trailing integer in each memref type is its memory
// space (2 or 1 here).
// NOTE(review): presumably 2 = core-local memory and 1 = shared/memtile
// memory; confirm against the target's memory-space convention.
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
%alloc_2 = memref.alloc() : memref<32x64xi32, 1>
%alloc_3 = memref.alloc() : memref<64x64xi32, 1>
%alloc_4 = memref.alloc() : memref<32x64xi32, 1>
// Result buffer in the default memory space; converted back to a tensor at the end.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
// Outer 1x1 forall; its mapping attribute (on the closing brace) is gpu.block.
scf.forall (%arg2, %arg3) in (1, 1) {
%3 = affine.apply #map(%arg2)
%4 = affine.apply #map1(%arg3)
%subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
// ---- Peeled first K-chunk: input columns/rows 0..63 ----
%subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
%subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
// Stage both input chunks into the memory-space-1 buffers.
linalg.copy ins(%subview_5 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
linalg.copy ins(%subview_6 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
// Inner 1x2 forall (mapping: gpu.thread). Each iteration takes a 32-column
// slice of the second input and of the result (%6 = index * 32).
// NOTE(review): as written, both iterations use the same
// %alloc / %alloc_0 / %alloc_1 buffers, which are allocated outside the forall.
scf.forall (%arg4, %arg5) in (1, 2) {
%5 = affine.apply #map(%arg4)
%6 = affine.apply #map(%arg5)
%subview_7 = memref.subview %alloc_4[%5, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1], offset: ?>, 1>
%subview_8 = memref.subview %alloc_3[0, %6] [64, 32] [1, 1] : memref<64x64xi32, 1> to memref<64x32xi32, strided<[64, 1], offset: ?>, 1>
%subview_9 = memref.subview %alloc_2[%5, %6] [32, 32] [1, 1] : memref<32x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
// Only the first chunk starts from a zeroed accumulator.
linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>)
// Pack the inputs into tiled layouts (4x8 tiles for the first input, 8x8 for
// the second), with the outer tile dims transposed (outer_dims_perm = [1, 0]).
iree_linalg_ext.pack %subview_7 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1], offset: ?>, 1> memref<8x8x4x8xi32, 2>)
iree_linalg_ext.pack %subview_8 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x8x8xi32, 2>)
// Tiled multiply-accumulate on the packed operands: %alloc_1 += %alloc_0 * %alloc.
linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
^bb0(%in: i32, %in_10: i32, %out: i32):
%7 = arith.muli %in, %in_10 : i32
%8 = arith.addi %out, %7 : i32
linalg.yield %8 : i32
}
// Unpack the accumulator into this iteration's 32x32 slice of %alloc_2.
iree_linalg_ext.unpack %alloc_1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_9 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Store the partial result of the first chunk to the output buffer.
linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
// ---- Remaining K-chunks: offsets 64, 128, ..., 960 ----
scf.for %arg4 = %c64 to %c1024 step %c64 {
%subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
%subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
linalg.copy ins(%subview_7 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
linalg.copy ins(%subview_8 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
// Reload the running partial result so it can seed the accumulator below.
linalg.copy ins(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>) outs(%alloc_2 : memref<32x64xi32, 1>)
scf.forall (%arg5, %arg6) in (1, 2) {
%5 = affine.apply #map(%arg5)
%6 = affine.apply #map(%arg6)
%subview_9 = memref.subview %alloc_4[%5, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1], offset: ?>, 1>
%subview_10 = memref.subview %alloc_3[0, %6] [64, 32] [1, 1] : memref<64x64xi32, 1> to memref<64x32xi32, strided<[64, 1], offset: ?>, 1>
%subview_11 = memref.subview %alloc_2[%5, %6] [32, 32] [1, 1] : memref<32x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
// Instead of linalg.fill, the accumulator is initialised by packing the
// previous partial result; this is the only difference from the peeled body.
iree_linalg_ext.pack %subview_11 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_1 : (memref<32x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x4x8xi32, 2>)
iree_linalg_ext.pack %subview_9 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1], offset: ?>, 1> memref<8x8x4x8xi32, 2>)
iree_linalg_ext.pack %subview_10 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x8x8xi32, 2>)
linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
^bb0(%in: i32, %in_12: i32, %out: i32):
%7 = arith.muli %in, %in_12 : i32
%8 = arith.addi %out, %7 : i32
linalg.yield %8 : i32
}
iree_linalg_ext.unpack %alloc_1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_11 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Write the updated partial result back to the output buffer.
linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%2 = bufferization.to_tensor %alloca : memref<32x64xi32>
// Release the scratch buffers allocated with memref.alloc above.
memref.dealloc %alloc_4 : memref<32x64xi32, 1>
memref.dealloc %alloc_3 : memref<64x64xi32, 1>
memref.dealloc %alloc_2 : memref<32x64xi32, 1>
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
return %2 : tensor<32x64xi32>
}
}
// -----// IR Dump After PackToDma (iree-amdaie-pack-to-dma) //----- //
// Affine maps used by this module.
//   #map  : forall index -> element offset, in steps of 32.
//   #map1 : forall index -> element offset, in steps of 64.
//   #map2 / #map3 / #map4 : indexing maps of the packed matmul (first input,
//   second input, accumulator); d2 and d5 are the reduction dims.
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
// Same peeled i32 matmul, (32x1024) x (1024x64) -> 32x64, but every
// iree_linalg_ext.pack / unpack has become an air.dma_memcpy_nd whose explicit
// offsets / sizes / strides encode the layout change.
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
// Index constants used as DMA offsets, sizes and strides below.
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c1024 = arith.constant 1024 : index
%c0_i32 = arith.constant 0 : i32
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
// Scratch buffers; the trailing integer in each type is the memory space.
// NOTE(review): presumably 2 = core-local memory and 1 = shared/memtile
// memory; confirm against the target's memory-space convention.
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
%alloc_2 = memref.alloc() : memref<32x64xi32, 1>
%alloc_3 = memref.alloc() : memref<64x64xi32, 1>
%alloc_4 = memref.alloc() : memref<32x64xi32, 1>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
// Outer 1x1 forall (mapping: gpu.block).
scf.forall (%arg2, %arg3) in (1, 1) {
%3 = affine.apply #map(%arg2)
%4 = affine.apply #map1(%arg3)
%subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
// ---- Peeled first K-chunk: input columns/rows 0..63 ----
%subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
%subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
linalg.copy ins(%subview_5 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
linalg.copy ins(%subview_6 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
// Inner 1x2 forall (mapping: gpu.thread); %5 / %6 are the row / column
// offsets of this iteration's slice and are passed directly as DMA offsets.
scf.forall (%arg4, %arg5) in (1, 2) {
%5 = affine.apply #map(%arg4)
%6 = affine.apply #map(%arg5)
// Only the first chunk starts from a zeroed accumulator.
linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>)
// Each DMA reads as (dst[offsets] [sizes] [strides], src[offsets] [sizes] [strides]).
// First input: the contiguous 8x8x4x8 destination is filled from the 32x64
// row-major source using strides [8, 256, 64, 1].
air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_4[%c0, %c0, %5, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<8x8x4x8xi32, 2>, memref<32x64xi32, 1>)
// Second input: the 4x8x8x8 destination is filled from the 64x64 source
// starting at column %6, using strides [8, 512, 64, 1].
air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %alloc_3[%c0, %c0, %c0, %6] [%c4, %c8, %c8, %c8] [%c8, %c512, %c64, %c1]) : (memref<4x8x8x8xi32, 2>, memref<64x64xi32, 1>)
// Tiled multiply-accumulate on the packed operands: %alloc_1 += %alloc_0 * %alloc.
linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
^bb0(%in: i32, %in_7: i32, %out: i32):
%7 = arith.muli %in, %in_7 : i32
%8 = arith.addi %out, %7 : i32
linalg.yield %8 : i32
}
// Result: the packed accumulator is read with strides [32, 8, 256, 1] and
// written to the 32x32 window of %alloc_2 at (%5, %6).
air.dma_memcpy_nd (%alloc_2[%5, %6] [%c32, %c32] [%c64, %c1], %alloc_1[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (memref<32x64xi32, 1>, memref<4x8x4x8xi32, 2>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Store the partial result of the first chunk to the output buffer.
linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
// ---- Remaining K-chunks: offsets 64, 128, ..., 960 ----
scf.for %arg4 = %c64 to %c1024 step %c64 {
%subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
%subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
linalg.copy ins(%subview_7 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
linalg.copy ins(%subview_8 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
// Reload the running partial result so it can seed the accumulator below.
linalg.copy ins(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>) outs(%alloc_2 : memref<32x64xi32, 1>)
scf.forall (%arg5, %arg6) in (1, 2) {
%5 = affine.apply #map(%arg5)
%6 = affine.apply #map(%arg6)
// Accumulator init: load the previous partial result from %alloc_2 at (%5, %6)
// into %alloc_1; this takes the place of the linalg.fill in the peeled body.
air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c4, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_2[%c0, %c0, %5, %6] [%c4, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<4x8x4x8xi32, 2>, memref<32x64xi32, 1>)
air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_4[%c0, %c0, %5, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<8x8x4x8xi32, 2>, memref<32x64xi32, 1>)
air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %alloc_3[%c0, %c0, %c0, %6] [%c4, %c8, %c8, %c8] [%c8, %c512, %c64, %c1]) : (memref<4x8x8x8xi32, 2>, memref<64x64xi32, 1>)
linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
^bb0(%in: i32, %in_9: i32, %out: i32):
%7 = arith.muli %in, %in_9 : i32
%8 = arith.addi %out, %7 : i32
linalg.yield %8 : i32
}
air.dma_memcpy_nd (%alloc_2[%5, %6] [%c32, %c32] [%c64, %c1], %alloc_1[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (memref<32x64xi32, 1>, memref<4x8x4x8xi32, 2>)
} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
// Write the updated partial result back to the output buffer.
linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%2 = bufferization.to_tensor %alloca : memref<32x64xi32>
memref.dealloc %alloc_4 : memref<32x64xi32, 1>
memref.dealloc %alloc_3 : memref<64x64xi32, 1>
memref.dealloc %alloc_2 : memref<32x64xi32, 1>
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
return %2 : tensor<32x64xi32>
}
}
// -----// IR Dump After DmaToObjectFifo (Conceptually, could be multiple transformations) //----- //
// Here, we insert the objectfifo state machines.
//
// Skipped steps among others:
// - Tile assignment to objectfifos. Note that this potentially impacts the complexity of figuring out whether objectfifos can be reused across the peeled and non-peeled parts of the reduction loop.
// -> Could potentially be solved using logical objectfifos on buffers/memories, with partial tile constraints. For example only on the tile destination side as this information would be needed as a starting point
// to derive which objectfifos could potentially be merged.
// - The aiex.ipu.load(%core) operation doesn't exist yet as far as I can tell. This would be needed to reload the AIE core `main` within a single xclbin.
//
// NOTES:
// - Parallelized core code could still be represented more compactly at this stage using scf.forall instead of unrolling
// - Core code could still be represented using linalg.generic in earlier stages
// - Tiles and cores are non-ssa for simplicity here (tile_0_1, core_0_2)
// - Can objectfifos support multi producer - single consumer?
// Affine maps carried over from the earlier stages. Only #map and #map1 are
// still referenced in this module; #map2..#map4 were the indexing maps of the
// linalg.generic that is now a call into the core kernel.
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
// Conceptual IR: the peeled i32 matmul, (32x1024) x (1024x64) -> 32x64, with
// data movement expressed as AIE objectfifos and the compute as aie.core
// bodies. The peeled first K-chunk uses @fill_matmul (zero-initialised
// accumulator); the reduction loop uses @matmul, which additionally streams the
// previous partial result in through @memI / @inI0 / @inI1.
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c1024 = arith.constant 1024 : index
%c0_i32 = arith.constant 0 : i32
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
// Scratch buffers left over from the previous stage; nothing below reads or
// writes them except the deallocs at the end.
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
%alloc_2 = memref.alloc() : memref<32x64xi32, 1>
%alloc_3 = memref.alloc() : memref<64x64xi32, 1>
%alloc_4 = memref.alloc() : memref<32x64xi32, 1>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
scf.forall (%arg2, %arg3) in (1, 1) {
%3 = affine.apply #map(%arg2)
%4 = affine.apply #map1(%arg3)
%subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
// ---- Peeled first K-chunk: input columns/rows 0..63 ----
%subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
%subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
// Tile (0, 0) is the endpoint of @memA / @memB / @memC towards the host side,
// so it is declared alongside the other tiles.
// NOTE(review): presumably (0, 0) is the shim tile, (x, 1) the memtiles and
// (x, 2) the compute tiles; confirm for the target device.
%tile_0_0 = aie.tile(0, 0)
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)
%tile_1_1 = aie.tile(1, 1)
%tile_1_2 = aie.tile(1, 2)
// Inputs arrive at tile (0, 1) and are re-streamed to the cores. The toStream
// lists are <size, stride> pairs, innermost dimension first, and mirror the
// source-side sizes/strides of the corresponding DMAs in the PackToDma dump.
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
// The first input is broadcast to both cores.
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
aie.objectfifo.link [@memA] -> [@inA]()
// The second input is split between the two cores. The outermost dimension
// walks 4 column tiles of 8 elements each (size 4, stride 8).
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
// Each core returns its 32x32 result slice; the two slices are joined into @memC.
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
// Core (0, 2): zero the output tile, multiply-accumulate into it, then release.
%core_0_2 = aie.core(%tile_0_2) {
%0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
%2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
%4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
aie.objectfifo.release @inA(Consume, 1)
aie.objectfifo.release @inB0(Consume, 1)
aie.objectfifo.release @outC0(Produce, 1)
aie.end
} {link_with = "mm.o"}
// Core (1, 2): same as core (0, 2), using its own @inB1 / @outC1 fifos.
%core_1_2 = aie.core(%tile_1_2) {
%0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
%2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
%4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
aie.objectfifo.release @inA(Consume, 1)
aie.objectfifo.release @inB1(Consume, 1)
aie.objectfifo.release @outC1(Produce, 1)
aie.end
} {link_with = "mm.o"}
// Control code for the peeled chunk: (re)load both cores, issue the two input
// transfers and the output transfer, then wait on the sync.
func.func @fill_matmul(%arg0: memref<32x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<32x64xi32>) {
aiex.ipu.load(%core_0_2)
aiex.ipu.load(%core_1_2)
aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
return
}
@fill_matmul(%subview_5, %subview_6, %subview)
// ---- Remaining K-chunks: offsets 64, 128, ..., 960 ----
scf.for %arg4 = %c64 to %c1024 step %c64 {
%subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
%subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
%tile_0_0 = aie.tile(0, 0)
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)
%tile_1_1 = aie.tile(1, 1)
%tile_1_2 = aie.tile(1, 2)
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
// Additional input path compared with the peeled chunk: the previous partial
// result comes in through @memI and is split between the cores to seed their
// accumulators.
aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
aie.objectfifo.link [@memA] -> [@inA]()
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
// Core (0, 2): no fill here; the kernel takes the previous partial result
// (%3) as an extra first operand.
// NOTE(review): this 4-operand call uses the same symbol name as the 3-operand
// call in the peeled part; confirm that mm.o provides both variants.
%core_0_2 = aie.core(%tile_0_2) {
%0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
%2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
%4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
%6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
%7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
aie.objectfifo.release @inI0(Consume, 1)
aie.objectfifo.release @inA(Consume, 1)
aie.objectfifo.release @inB0(Consume, 1)
aie.objectfifo.release @outC0(Produce, 1)
aie.end
} {link_with = "mm.o"}
// Core (1, 2): produces into its own @outC1. @outC0 is produced by tile (0, 2)
// only, matching the fifo declarations above and the peeled core (1, 2).
%core_1_2 = aie.core(%tile_1_2) {
%0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
%2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
%4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
%6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
%7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
aie.objectfifo.release @inI1(Consume, 1)
aie.objectfifo.release @inA(Consume, 1)
aie.objectfifo.release @inB1(Consume, 1)
aie.objectfifo.release @outC1(Produce, 1)
aie.end
} {link_with = "mm.o"}
// Control code for one loop iteration. %arg2 is both read (through @memI, as
// the previous partial result) and written (through @memC). Each transfer
// carries a distinct id.
func.func @matmul(%arg0: memref<32x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<32x64xi32>) {
aiex.ipu.load(%core_0_2)
aiex.ipu.load(%core_1_2)
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 3 : i64, metadata = @memI} : memref<32x64xi32>
aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
return
}
@matmul(%subview_7, %subview_8, %subview)
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%2 = bufferization.to_tensor %alloca : memref<32x64xi32>
memref.dealloc %alloc_4 : memref<32x64xi32, 1>
memref.dealloc %alloc_3 : memref<64x64xi32, 1>
memref.dealloc %alloc_2 : memref<32x64xi32, 1>
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
return %2 : tensor<32x64xi32>
}
}
// -----// IR Dump After AieBringInLoopsIntoAieControlCode (Conceptually, could be multiple transformations) //----- //
// Steps:
// - Move subviews into aiex.ipu.dma_memcpy_nd
// - Move loops into AIE control code function
// NOTES:
// - aiex.ipu.dma_memcpy_nd strides seem to be 3-dimensional in the examples (is stride0 assumed to always be 1?); I use four dimensions here, similar to air.dma_memcpy_nd
// - Tiles and cores are non-ssa for simplicity here (tile_0_1, core_0_2)
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c1024 = arith.constant 1024 : index
%c0_i32 = arith.constant 0 : i32
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
%alloc_2 = memref.alloc() : memref<32x64xi32, 1>
%alloc_3 = memref.alloc() : memref<64x64xi32, 1>
%alloc_4 = memref.alloc() : memref<32x64xi32, 1>
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
scf.forall (%arg2, %arg3) in (1, 1) {
%3 = affine.apply #map(%arg2)
%4 = affine.apply #map1(%arg3)
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)
%tile_1_1 = aie.tile(1, 1)
%tile_1_2 = aie.tile(1, 2)
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
aie.objectfifo.link [@memA] -> [@inA]()
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo.link [@outC0,outC1] -> [@memC]()
%core_0_2 = aie.core(%tile_0_2) {
%0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
%2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
%4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
aie.objectfifo.release @inA(Consume, 1)
aie.objectfifo.release @inB0(Consume, 1)
aie.objectfifo.release @outC0(Produce, 1)
aie.end
} {link_with = "mm.o"}
%core_1_2 = aie.core(%tile_1_2) {
%0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
%2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
%4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
aie.objectfifo.release @inA(Consume, 1)
aie.objectfifo.release @inB1(Consume, 1)
aie.objectfifo.release @outC1(Produce, 1)
aie.end
} {link_with = "mm.o"}
func.func @fill_matmul(%arg0: memref<32x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<32x64xi32>) {
aiex.ipu.load(%core_0_2)
aiex.ipu.load(%core_1_2)
aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
return
}
fill_matmul(%1, %0, %alloca)
// Tile handles and objectfifo topology for the accumulating matmul step.
// NOTE(review): %tile_0_0 is used below but is not defined in the visible
// part of this dump - confirm it is declared earlier.
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)
%tile_1_1 = aie.tile(1, 1)
%tile_1_2 = aie.tile(1, 2)
// Fifos from %tile_0_0 into %tile_0_1 for A, B and the incoming partial result (I).
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
// Fifos from %tile_0_1 to the two cores. Each toStream list consists of four
// separate <size, stride> entries.
aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
// @inA has both cores as consumers.
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
aie.objectfifo.link [@memA] -> [@inA]()
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
// Result fifos from the cores back to %tile_0_1, joined into @memC.
// NOTE(review): these are declared with memref<32x32xi32> elements while the
// cores acquire them as memref<4x8x4x8xi32> - confirm which is intended.
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
%core_0_2 = aie.core(%tile_0_2) {
  // Accumulating matmul step for the core on %tile_0_2.
  // Takes one output element from @outC0 (as producer) and one element each
  // from @inI0, @inA and @inB0 (as consumer), then calls
  // @matmul_vectorized_i32 with (I, A, B, C) and releases all four fifos.
  %c_sub = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %c_buf = aie.objectfifo.subview.access %c_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %acc_sub = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %acc_buf = aie.objectfifo.subview.access %acc_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %a_sub = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
  %a_buf = aie.objectfifo.subview.access %a_sub[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
  %b_sub = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
  %b_buf = aie.objectfifo.subview.access %b_sub[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
  func.call @matmul_vectorized_i32(%acc_buf, %a_buf, %b_buf, %c_buf) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
  aie.objectfifo.release @inI0(Consume, 1)
  aie.objectfifo.release @inA(Consume, 1)
  aie.objectfifo.release @inB0(Consume, 1)
  aie.objectfifo.release @outC0(Produce, 1)
  aie.end
} {link_with = "mm.o"}
%core_1_2 = aie.core(%tile_1_2) {
  // Accumulating matmul step for the core on %tile_1_2.
  // This core uses its own fifos: @outC1 is produced by %tile_1_2 and @inI1 /
  // @inB1 list %tile_1_2 as their consumer, whereas @outC0 / @inI0 belong to
  // %tile_0_2. Every acquire is paired with a release of the same fifo.
  %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
  %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
  %6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
  %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
  // Operands are (I, A, B, C).
  func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
  aie.objectfifo.release @inI1(Consume, 1)
  aie.objectfifo.release @inA(Consume, 1)
  aie.objectfifo.release @inB1(Consume, 1)
  aie.objectfifo.release @outC1(Produce, 1)
  aie.end
} {link_with = "mm.o"}
// Host-side control sequence for the remaining reduction steps.
//   %arg0 : A (32x1024); each step transfers the 32x64 column block at %arg4
//   %arg1 : B (1024x64); each step transfers 4096 contiguous elements
//   %arg2 : C (32x64); sent in through @memI and attached to @memC each step
// The loop starts at %c64 because the first step is handled by the peeled
// fill + matmul.
func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
  scf.for %arg4 = %c64 to %c1024 step %c64 {
    // Presumably (re)loads the programs of both compute cores - TODO confirm.
    aiex.ipu.load(%core_0_2)
    aiex.ipu.load(%core_1_2)
    // Each transfer issued before the sync has its own id (3, 2, 1, 0); the
    // trailing memref type is the type of the memref operand.
    aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 3 : i64, metadata = @memI} : memref<32x64xi32>
    aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x1024xi32>
    // NOTE(review): the row offset %arg4 sits in a dimension whose stride is
    // 0 - confirm this addresses rows %arg4..%arg4+63 of B as intended.
    aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<1024x64xi32>
    aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
    aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
  }
  return
}
// NOTE(review): %subview is not defined in the visible part of this dump -
// confirm it is still available at this stage.
@matmul(%1, %0, %subview)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
%2 = bufferization.to_tensor %alloca : memref<32x64xi32>
memref.dealloc %alloc_4 : memref<32x64xi32, 1>
memref.dealloc %alloc_3 : memref<64x64xi32, 1>
memref.dealloc %alloc_2 : memref<32x64xi32, 1>
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
return %2 : tensor<32x64xi32>
}
}
// -----// IR Dump After AieBringInLoopsIntoAieCoreCode (Conceptually, could be multiple transformations) //----- //
// Move loops into AIE core code to avoid reloading the cores' `main` program.
// Steps:
// - Move subviews into aiex.ipu.dma_memcpy_nd
// - Move loops into AIE control code function
// NOTES:
// - aiex.ipu.dma_memcpy_nd strides seem to be 3-dimensional in the examples (stride0 is presumably always 1?); I use four dimensions here, similar to air.dma_memcpy_nd.
// - Tiles and cores are non-SSA here for simplicity (tile_0_1, core_0_2).
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
// Index and integer constants.
// NOTE(review): within this dump only %c64 and %c1024 are referenced (as scf.for bounds and step); the others look like leftovers - confirm before removing.
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c1024 = arith.constant 1024 : index
%c0_i32 = arith.constant 0 : i32
// Memref views of the tensor arguments: %0 wraps %arg1 (1024x64), %1 wraps %arg0 (32x1024).
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
// Buffers in memory spaces 2 and 1.
// NOTE(review): in this dump they are only deallocated at the end of the function and never otherwise referenced.
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
%alloc_2 = memref.alloc() : memref<32x64xi32, 1>
%alloc_3 = memref.alloc() : memref<64x64xi32, 1>
%alloc_4 = memref.alloc() : memref<32x64xi32, 1>
// 32x64 result buffer; passed to the control functions below and converted back to a tensor for the return value.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
scf.forall (%arg2, %arg3) in (1, 1) {
// Offsets derived from the forall indices; not referenced again in this dump.
%3 = affine.apply #map(%arg2)
%4 = affine.apply #map1(%arg3)
// Tile handles. %tile_0_0 is the endpoint of @memA/@memB/@memC below, so it
// is declared alongside the other tiles.
%tile_0_0 = aie.tile(0, 0)
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)
%tile_1_1 = aie.tile(1, 1)
%tile_1_2 = aie.tile(1, 2)
// Objectfifo topology for the peeled fill + matmul step.
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
// Each toStream list consists of four separate <size, stride> entries.
// @inA has both cores as consumers.
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
aie.objectfifo.link [@memA] -> [@inA]()
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
// Result fifos from the cores back to %tile_0_1, joined into @memC.
// NOTE(review): these are declared with memref<32x32xi32> elements while the
// cores acquire them as memref<4x8x4x8xi32> - confirm which is intended.
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
%core_0_2 = aie.core(%tile_0_2) {
  // Peeled fill + matmul step for the core on %tile_0_2.
  // Takes one output element from @outC0 (as producer) and one input element
  // each from @inA and @inB0 (as consumer), calls @fill on the output buffer,
  // then calls @matmul_vectorized_i32 with (A, B, C), and finally releases all
  // three fifos.
  %c_sub = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %c_buf = aie.objectfifo.subview.access %c_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %a_sub = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
  %a_buf = aie.objectfifo.subview.access %a_sub[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
  %b_sub = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
  %b_buf = aie.objectfifo.subview.access %b_sub[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
  // @fill is defined outside this view; presumably it initializes the accumulator - TODO confirm.
  func.call @fill(%c_buf) : (memref<4x8x4x8xi32>) -> ()
  func.call @matmul_vectorized_i32(%a_buf, %b_buf, %c_buf) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
  aie.objectfifo.release @inA(Consume, 1)
  aie.objectfifo.release @inB0(Consume, 1)
  aie.objectfifo.release @outC0(Produce, 1)
  aie.end
} {link_with = "mm.o"}
%core_1_2 = aie.core(%tile_1_2) {
  // Peeled fill + matmul step for the core on %tile_1_2.
  // Same sequence as the core on %tile_0_2, using this core's fifos @outC1
  // and @inB1 together with the shared @inA.
  %c_sub = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %c_buf = aie.objectfifo.subview.access %c_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %a_sub = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
  %a_buf = aie.objectfifo.subview.access %a_sub[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
  %b_sub = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
  %b_buf = aie.objectfifo.subview.access %b_sub[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
  // @fill is defined outside this view; presumably it initializes the accumulator - TODO confirm.
  func.call @fill(%c_buf) : (memref<4x8x4x8xi32>) -> ()
  func.call @matmul_vectorized_i32(%a_buf, %b_buf, %c_buf) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
  aie.objectfifo.release @inA(Consume, 1)
  aie.objectfifo.release @inB1(Consume, 1)
  aie.objectfifo.release @outC1(Produce, 1)
  aie.end
} {link_with = "mm.o"}
// Host-side control sequence for the peeled fill + matmul step.
//   %arg0 : A (32x1024); the 32x64 block at column 0 is transferred
//   %arg1 : B (1024x64); the first 4096 contiguous elements are transferred
//   %arg2 : C (32x64); attached to the @memC transfer
func.func @fill_matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
  // Presumably (re)loads the programs of both compute cores - TODO confirm.
  aiex.ipu.load(%core_0_2)
  aiex.ipu.load(%core_1_2)
  // The trailing memref type is the type of the memref operand, so it matches
  // the argument types declared in the signature.
  aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x1024xi32>
  aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<1024x64xi32>
  aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
  // Presumably waits for the transfer on channel 0 of column 0, row 0 - TODO confirm.
  aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
  return
}
@fill_matmul(%1, %0, %alloca)
// Tile handles and objectfifo topology for the accumulating matmul step
// (adds the @memI / @inI0 / @inI1 path for the incoming partial result).
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)
%tile_1_1 = aie.tile(1, 1)
%tile_1_2 = aie.tile(1, 2)
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
// Each toStream list consists of four separate <size, stride> entries.
aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
// @inA has both cores as consumers.
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
aie.objectfifo.link [@memA] -> [@inA]()
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
// Result fifos from the cores back to %tile_0_1, joined into @memC.
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
%core_0_2 = aie.core(%tile_0_2) {
  // Accumulating matmul for the core on %tile_0_2, with the reduction loop
  // inside the core. The induction variable is not used in the body: every
  // iteration takes the next element from each fifo.
  scf.for %k = %c64 to %c1024 step %c64 {
    %c_sub = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %c_buf = aie.objectfifo.subview.access %c_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %acc_sub = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %acc_buf = aie.objectfifo.subview.access %acc_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %a_sub = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
    %a_buf = aie.objectfifo.subview.access %a_sub[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
    %b_sub = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
    %b_buf = aie.objectfifo.subview.access %b_sub[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
    // Operands are (I, A, B, C).
    func.call @matmul_vectorized_i32(%acc_buf, %a_buf, %b_buf, %c_buf) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
    aie.objectfifo.release @inI0(Consume, 1)
    aie.objectfifo.release @inA(Consume, 1)
    aie.objectfifo.release @inB0(Consume, 1)
    aie.objectfifo.release @outC0(Produce, 1)
  }
  aie.end
} {link_with = "mm.o"}
%core_1_2 = aie.core(%tile_1_2) {
  // Accumulating matmul for the core on %tile_1_2, with the reduction loop
  // inside the core. The output fifo is @outC1, which is the one produced by
  // %tile_1_2 (@outC0 belongs to %tile_0_2). The induction variable is not
  // used in the body.
  scf.for %arg4 = %c64 to %c1024 step %c64 {
    %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
    %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
    %6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
    %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
    // Operands are (I, A, B, C).
    func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
    aie.objectfifo.release @inI1(Consume, 1)
    aie.objectfifo.release @inA(Consume, 1)
    aie.objectfifo.release @inB1(Consume, 1)
    aie.objectfifo.release @outC1(Produce, 1)
  }
  aie.end
} {link_with = "mm.o"}
// Host-side control sequence for the remaining reduction steps; the core
// programs are loaded once, before the loop.
//   %arg0 : A (32x1024); each step transfers the 32x64 column block at %arg4
//   %arg1 : B (1024x64); each step transfers 4096 contiguous elements
//   %arg2 : C (32x64); sent in through @memI and attached to @memC each step
func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
  aiex.ipu.load(%core_0_2)
  aiex.ipu.load(%core_1_2)
  scf.for %arg4 = %c64 to %c1024 step %c64 {
    // Each transfer issued before the sync has its own id (3, 2, 1, 0); the
    // trailing memref type is the type of the memref operand.
    aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 3 : i64, metadata = @memI} : memref<32x64xi32>
    aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x1024xi32>
    // NOTE(review): the row offset %arg4 sits in a dimension whose stride is
    // 0 - confirm this addresses rows %arg4..%arg4+63 of B as intended.
    aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<1024x64xi32>
    aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
    aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
  }
  return
}
@matmul(%1, %0, %alloca)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Convert the 32x64 result buffer back into a tensor for the return value.
%2 = bufferization.to_tensor %alloca : memref<32x64xi32>
// Free the buffers created with memref.alloc at the top of the function.
memref.dealloc %alloc_4 : memref<32x64xi32, 1>
memref.dealloc %alloc_3 : memref<64x64xi32, 1>
memref.dealloc %alloc_2 : memref<32x64xi32, 1>
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
return %2 : tensor<32x64xi32>
}
}
// -----// IR Dump After AieSuperimposeStateMachines (Conceptually, could be multiple transformations) //----- //
// Here, we check for overlapping objectfifo state machines, like we see above (peeled fill + matmul is a 'sub' state machine of unpeeled matmul)
// and keep the 'super' state machine
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
// Index and integer constants.
// NOTE(review): within this dump only %c64 and %c1024 are referenced (as scf.for bounds and step); the others look like leftovers - confirm before removing.
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c1024 = arith.constant 1024 : index
%c0_i32 = arith.constant 0 : i32
// Memref views of the tensor arguments: %0 wraps %arg1 (1024x64), %1 wraps %arg0 (32x1024).
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
// Buffers in memory spaces 2 and 1.
// NOTE(review): in this dump they are only deallocated at the end of the function and never otherwise referenced.
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
%alloc_2 = memref.alloc() : memref<32x64xi32, 1>
%alloc_3 = memref.alloc() : memref<64x64xi32, 1>
%alloc_4 = memref.alloc() : memref<32x64xi32, 1>
// 32x64 result buffer; passed to @matmul below and converted back to a tensor for the return value.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
scf.forall (%arg2, %arg3) in (1, 1) {
// Offsets derived from the forall indices; not referenced again in this dump.
%3 = affine.apply #map(%arg2)
%4 = affine.apply #map1(%arg3)
// Tile handles. %tile_0_0 is the endpoint of @memA/@memB/@memI/@memC below,
// so it is declared alongside the other tiles.
%tile_0_0 = aie.tile(0, 0)
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)
%tile_1_1 = aie.tile(1, 1)
%tile_1_2 = aie.tile(1, 2)
// Single ('super') objectfifo topology shared by the peeled and the looped cores.
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
// Each toStream list consists of four separate <size, stride> entries.
aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
// @inA has both cores as consumers.
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
aie.objectfifo.link [@memA] -> [@inA]()
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
// Result fifos from the cores back to %tile_0_1, joined into @memC.
// NOTE(review): these are declared with memref<32x32xi32> elements while the
// cores acquire them as memref<4x8x4x8xi32> - confirm which is intended.
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
%core_0_2_ = aie.core(%tile_0_2) {
  // Peeled fill + matmul program for %tile_0_2 (the trailing underscore in
  // the name distinguishes it from the looped core on the same tile).
  // Takes one output element from @outC0 and one input element each from
  // @inA and @inB0, calls @fill on the output buffer, then calls
  // @matmul_vectorized_i32 with (A, B, C), and releases all three fifos.
  %c_sub = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %c_buf = aie.objectfifo.subview.access %c_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %a_sub = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
  %a_buf = aie.objectfifo.subview.access %a_sub[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
  %b_sub = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
  %b_buf = aie.objectfifo.subview.access %b_sub[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
  // @fill is defined outside this view; presumably it initializes the accumulator - TODO confirm.
  func.call @fill(%c_buf) : (memref<4x8x4x8xi32>) -> ()
  func.call @matmul_vectorized_i32(%a_buf, %b_buf, %c_buf) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
  aie.objectfifo.release @inA(Consume, 1)
  aie.objectfifo.release @inB0(Consume, 1)
  aie.objectfifo.release @outC0(Produce, 1)
  aie.end
} {link_with = "mm.o"}
%core_1_2_ = aie.core(%tile_1_2) {
  // Peeled fill + matmul program for %tile_1_2 (the trailing underscore in
  // the name distinguishes it from the looped core on the same tile).
  // B is taken from @inB1, the fifo whose consumer is %tile_1_2 and which is
  // released below; @inB0 belongs to %tile_0_2.
  %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
  %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
  %4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
  %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
  // @fill is defined outside this view; presumably it initializes the accumulator - TODO confirm.
  func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
  func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
  aie.objectfifo.release @inA(Consume, 1)
  aie.objectfifo.release @inB1(Consume, 1)
  aie.objectfifo.release @outC1(Produce, 1)
  aie.end
} {link_with = "mm.o"}
%core_0_2 = aie.core(%tile_0_2) {
  // Looped accumulating matmul program for %tile_0_2. The induction variable
  // is not used in the body: every iteration takes the next element from
  // @outC0, @inI0, @inA and @inB0, runs the kernel, and releases them.
  scf.for %k = %c64 to %c1024 step %c64 {
    %c_sub = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %c_buf = aie.objectfifo.subview.access %c_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %acc_sub = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %acc_buf = aie.objectfifo.subview.access %acc_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %a_sub = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
    %a_buf = aie.objectfifo.subview.access %a_sub[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
    %b_sub = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
    %b_buf = aie.objectfifo.subview.access %b_sub[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
    // Operands are (I, A, B, C).
    func.call @matmul_vectorized_i32(%acc_buf, %a_buf, %b_buf, %c_buf) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
    aie.objectfifo.release @inI0(Consume, 1)
    aie.objectfifo.release @inA(Consume, 1)
    aie.objectfifo.release @inB0(Consume, 1)
    aie.objectfifo.release @outC0(Produce, 1)
  }
  aie.end
} {link_with = "mm.o"}
%core_1_2 = aie.core(%tile_1_2) {
  // Looped accumulating matmul program for %tile_1_2. The induction variable
  // is not used in the body: every iteration takes the next element from
  // @outC1, @inI1, @inA and @inB1, runs the kernel, and releases them.
  scf.for %k = %c64 to %c1024 step %c64 {
    %c_sub = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %c_buf = aie.objectfifo.subview.access %c_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %acc_sub = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %acc_buf = aie.objectfifo.subview.access %acc_sub[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %a_sub = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
    %a_buf = aie.objectfifo.subview.access %a_sub[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
    %b_sub = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
    %b_buf = aie.objectfifo.subview.access %b_sub[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
    // Operands are (I, A, B, C).
    func.call @matmul_vectorized_i32(%acc_buf, %a_buf, %b_buf, %c_buf) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
    aie.objectfifo.release @inI1(Consume, 1)
    aie.objectfifo.release @inA(Consume, 1)
    aie.objectfifo.release @inB1(Consume, 1)
    aie.objectfifo.release @outC1(Produce, 1)
  }
  aie.end
} {link_with = "mm.o"}
// Host-side control sequence covering both phases:
//   1. the peeled fill + matmul step using cores %core_0_2_ / %core_1_2_
//   2. the remaining reduction steps using cores %core_0_2 / %core_1_2
//   %arg0 : A (32x1024), %arg1 : B (1024x64), %arg2 : C (32x64)
func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
  // Phase 1: first 32x64 block of A and first 4096 elements of B.
  aiex.ipu.load(%core_0_2_)
  aiex.ipu.load(%core_1_2_)
  // The trailing memref type is the type of the memref operand, so it matches
  // the argument types declared in the signature.
  aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x1024xi32>
  aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<1024x64xi32>
  aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
  aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
  // Phase 2: switch to the looped cores and stream the remaining steps.
  aiex.ipu.load(%core_0_2)
  aiex.ipu.load(%core_1_2)
  scf.for %arg4 = %c64 to %c1024 step %c64 {
    // Each transfer issued before the sync has its own id (3, 2, 1, 0).
    aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 3 : i64, metadata = @memI} : memref<32x64xi32>
    aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x1024xi32>
    // NOTE(review): the row offset %arg4 sits in a dimension whose stride is
    // 0 - confirm this addresses rows %arg4..%arg4+63 of B as intended.
    aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<1024x64xi32>
    aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
    aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
  }
  return
}
@matmul(%1, %0, %alloca)
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Convert the 32x64 result buffer back into a tensor for the return value.
%2 = bufferization.to_tensor %alloca : memref<32x64xi32>
// Free the buffers created with memref.alloc at the top of the function.
memref.dealloc %alloc_4 : memref<32x64xi32, 1>
memref.dealloc %alloc_3 : memref<64x64xi32, 1>
memref.dealloc %alloc_2 : memref<32x64xi32, 1>
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
return %2 : tensor<32x64xi32>
}
}
// -----// IR Dump After AieCombineCoreCode (Conceptually, could be multiple transformations) //----- //
// Here, we try to combine core code into a single code block (will become a single elf) to avoid reloading `main`
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
// Entry point: takes a 32x1024 and a 1024x64 i32 tensor and returns a 32x64 i32 tensor.
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
// Constants. In the visible body only %c64 (lower bound and step) and %c1024
// (upper bound) are used, by the scf.for loops further down.
// NOTE(review): the other constants appear unused in this dump — confirm before removing.
%c512 = arith.constant 512 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%c1024 = arith.constant 1024 : index
%c0_i32 = arith.constant 0 : i32
// Memref views of the two input tensors: %0 wraps %arg1 (1024x64), %1 wraps %arg0 (32x1024).
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
// Buffers in memory spaces 2 and 1.
// NOTE(review): in the visible body these are only deallocated at the end of the
// function and never read or written; presumably the object FIFOs declared
// further down have replaced them — confirm.
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
%alloc_2 = memref.alloc() : memref<32x64xi32, 1>
%alloc_3 = memref.alloc() : memref<64x64xi32, 1>
%alloc_4 = memref.alloc() : memref<32x64xi32, 1>
// 32x64 result buffer; it is passed to @matmul and converted to the returned tensor.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
// Parallel loop with a single (1 x 1) iteration; its closing brace carries the
// gpu.block<y>/gpu.block<x> mapping attribute.
scf.forall (%arg2, %arg3) in (1, 1) {
// Offsets derived from the forall indices (index * 32 and index * 64).
// NOTE(review): %3 and %4 are not used anywhere in the visible body.
%3 = affine.apply #map(%arg2)
%4 = affine.apply #map1(%arg3)
// Tiles referenced by the object FIFOs and cores below.
// %tile_0_0 is the producer of @memA/@memB/@memI and the consumer of @memC, so it
// has to be declared here as well (it was used without a definition).
// NOTE(review): presumably row 0 is the shim row, row 1 the mem-tile row and row 2
// the first compute row on the target device — confirm.
%tile_0_0 = aie.tile(0, 0)
%tile_0_1 = aie.tile(0, 1)
%tile_0_2 = aie.tile(0, 2)
// NOTE(review): %tile_1_1 is declared but not referenced in the visible body.
%tile_1_1 = aie.tile(1, 1)
%tile_1_2 = aie.tile(1, 2)
// Input FIFOs from tile (0,0) into tile (0,1): the A tile, the B tile and the
// intermediate (partial-result) tile. Each has depth 2.
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
// Intermediate tile: @memI is linked to one FIFO per core (@inI0 -> (0,2), @inI1 -> (1,2)).
// Each toStream list holds four separate <size, stride> entries.
aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
// A tile: a single FIFO @inA with both cores listed as consumers.
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
aie.objectfifo.link [@memA] -> [@inA]()
// B tile: @memB is linked to one FIFO per core (@inB0 -> (0,2), @inB1 -> (1,2)).
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
// Output: each core produces into its own FIFO towards tile (0,1); both are linked
// into @memC, which goes from (0,1) back to (0,0).
// NOTE(review): @outC0/@outC1 are declared as memref<32x32xi32> while the cores
// acquire them as memref<4x8x4x8xi32> (same element count, 1024) — confirm intended type.
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
// Both source FIFOs are referenced by symbol (the second one was missing its '@').
aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
// Program for the core placed on %tile_0_2; linked against "mm.o".
// It handles one K tile up front (@fill followed by the 3-operand matmul call)
// and then runs a loop from 64 to 1024 in steps of 64 (15 iterations) that also
// consumes one object from @inI0 per iteration.
%core_0_2 = aie.core(%tile_0_2) {
  // ---- First K tile -------------------------------------------------------
  // Acquire one output object plus one A object and one B object.
  %out_view = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %out_buf = aie.objectfifo.subview.access %out_view[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %a_view = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
  %a_buf = aie.objectfifo.subview.access %a_view[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
  %b_view = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
  %b_buf = aie.objectfifo.subview.access %b_view[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
  // @fill is called on the output buffer before the matmul
  // (presumably it initialises the accumulator — confirm against mm.o).
  func.call @fill(%out_buf) : (memref<4x8x4x8xi32>) -> ()
  func.call @matmul_vectorized_i32(%a_buf, %b_buf, %out_buf) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
  // Release inputs first, then the produced output.
  aie.objectfifo.release @inA(Consume, 1)
  aie.objectfifo.release @inB0(Consume, 1)
  aie.objectfifo.release @outC0(Produce, 1)
  // ---- Remaining K tiles --------------------------------------------------
  scf.for %k = %c64 to %c1024 step %c64 {
    %acc_out_view = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %acc_out_buf = aie.objectfifo.subview.access %acc_out_view[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    // Extra input compared with the first tile: one object from @inI0.
    %init_view = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %init_buf = aie.objectfifo.subview.access %init_view[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %a_view_k = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
    %a_buf_k = aie.objectfifo.subview.access %a_view_k[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
    %b_view_k = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
    %b_buf_k = aie.objectfifo.subview.access %b_view_k[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
    // NOTE(review): this call passes four operands (@inI0 object, A, B, output)
    // whereas the call before the loop passes three — confirm the kernel signature.
    func.call @matmul_vectorized_i32(%init_buf, %a_buf_k, %b_buf_k, %acc_out_buf) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
    aie.objectfifo.release @inI0(Consume, 1)
    aie.objectfifo.release @inA(Consume, 1)
    aie.objectfifo.release @inB0(Consume, 1)
    aie.objectfifo.release @outC0(Produce, 1)
  }
  aie.end
} {link_with = "mm.o"}
// Program for the core placed on %tile_1_2; linked against "mm.o".
// It handles one K tile up front (@fill followed by the 3-operand matmul call)
// and then runs a loop from 64 to 1024 in steps of 64 (15 iterations) that also
// consumes one object from @inI1 per iteration. This core uses the per-core
// FIFOs @inB1, @inI1 and @outC1, and the @inA FIFO that lists both cores as consumers.
%core_1_2 = aie.core(%tile_1_2) {
  // ---- First K tile -------------------------------------------------------
  %out_view = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
  %out_buf = aie.objectfifo.subview.access %out_view[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
  %a_view = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
  %a_buf = aie.objectfifo.subview.access %a_view[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
  %b_view = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
  %b_buf = aie.objectfifo.subview.access %b_view[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
  // @fill is called on the output buffer before the matmul
  // (presumably it initialises the accumulator — confirm against mm.o).
  func.call @fill(%out_buf) : (memref<4x8x4x8xi32>) -> ()
  func.call @matmul_vectorized_i32(%a_buf, %b_buf, %out_buf) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
  aie.objectfifo.release @inA(Consume, 1)
  aie.objectfifo.release @inB1(Consume, 1)
  aie.objectfifo.release @outC1(Produce, 1)
  // ---- Remaining K tiles --------------------------------------------------
  scf.for %k = %c64 to %c1024 step %c64 {
    // The output object must come from @outC1: that is the FIFO whose producer
    // is %tile_1_2 and the one released at the end of this iteration. (@outC0
    // is produced by %tile_0_2 and is not released anywhere in this core.)
    %acc_out_view = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %acc_out_buf = aie.objectfifo.subview.access %acc_out_view[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    // Extra input compared with the first tile: one object from @inI1.
    %init_view = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
    %init_buf = aie.objectfifo.subview.access %init_view[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
    %a_view_k = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
    %a_buf_k = aie.objectfifo.subview.access %a_view_k[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
    %b_view_k = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
    %b_buf_k = aie.objectfifo.subview.access %b_view_k[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
    // NOTE(review): this call passes four operands (@inI1 object, A, B, output)
    // whereas the call before the loop passes three — confirm the kernel signature.
    func.call @matmul_vectorized_i32(%init_buf, %a_buf_k, %b_buf_k, %acc_out_buf) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
    aie.objectfifo.release @inI1(Consume, 1)
    aie.objectfifo.release @inA(Consume, 1)
    aie.objectfifo.release @inB1(Consume, 1)
    aie.objectfifo.release @outC1(Produce, 1)
  }
  aie.end
} {link_with = "mm.o"}
// Sequence of aiex.ipu operations that drives the design for one 32x1024 by
// 1024x64 matmul:
//   %arg0 - A (32x1024), %arg1 - B (1024x64), %arg2 - C (32x64).
// Both cores are loaded once at the top; the first K tile is then issued, followed
// by a loop over the remaining K offsets (64 to 1024 in steps of 64) that
// additionally sends the current contents of %arg2 through @memI.
// NOTE(review): the trailing memref types on the dma_memcpy_nd ops describe the
// transferred tile (e.g. 32x64) rather than the declared argument types — confirm.
func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
  aiex.ipu.load(%core_0_2)
  aiex.ipu.load(%core_1_2)
  // First K tile: a 32x64 window of A (row stride 1024), 4096 = 64*64 elements of B,
  // and 2048 = 32*64 elements for C.
  // NOTE(review): the @memA transfers list four stride values while the other
  // transfers list three — confirm which form the op expects.
  aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
  aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
  aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
  // NOTE(review): presumably waits for the transfer on column 0, row 0, channel 0
  // before the next K tile is issued — confirm the sync semantics.
  aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
  scf.for %arg4 = %c64 to %c1024 step %c64 {
    // @memI uses id 3 so that it does not share id 2 with the @memA transfer
    // issued in the same sync window; ids 0, 1 and 2 are taken by @memC, @memB
    // and @memA.
    aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 3 : i64, metadata = @memI} : memref<32x64xi32>
    // A is offset by %arg4 in its last dimension, B by %arg4 in its third dimension.
    aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
    aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
    aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
    aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
  }
  return
}
// Invocation of @matmul with the A memref (%1), the B memref (%0) and the result buffer.
// NOTE(review): written without `func.call` and without a type signature, unlike
// the calls inside the aie.core bodies — this dump is conceptual IR.
@matmul(%1, %0, %alloca)
// End of the scf.forall body; the two loop dimensions are mapped to gpu.block y and x.
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
// Convert the result buffer back into a tensor for the function result.
%2 = bufferization.to_tensor %alloca : memref<32x64xi32>
// Free the memref.alloc buffers in reverse order of allocation. %alloca comes
// from memref.alloca and has no matching dealloc.
memref.dealloc %alloc_4 : memref<32x64xi32, 1>
memref.dealloc %alloc_3 : memref<64x64xi32, 1>
memref.dealloc %alloc_2 : memref<32x64xi32, 1>
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
return %2 : tensor<32x64xi32>
// Closes func.func @matmul_i32.
}
// Closes the module.
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment