-
-
Save jtuyls/7e6a41619666fa3186b1a8156978eedc to your computer and use it in GitHub Desktop.
Peeled matmul to AIE code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Tiling maps: #map scales a loop iv by the 32-element tile, #map1 by the
// 64-element tile. #map2..#map4 index the packed operands of the 6-d
// matmul-as-linalg.generic below (d2/d5 are the reduction dims).
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
  // 32x1024 * 1024x64 i32 matmul. L3->L2 staging uses memory space 1, packed
  // core tiles use memory space 2. The first reduction step (k = 0) is peeled
  // out of the scf.for so the accumulator can be zero-initialized with
  // linalg.fill instead of being read back and re-packed.
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)
      %subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
      // Peeled first reduction iteration (k = 0).
      %subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
      %subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
      linalg.copy ins(%subview_5 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
      linalg.copy ins(%subview_6 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
      scf.forall (%arg4, %arg5) in (1, 2) {
        %5 = affine.apply #map(%arg4)
        %6 = affine.apply #map(%arg5)
        %subview_7 = memref.subview %alloc_4[%5, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1], offset: ?>, 1>
        %subview_8 = memref.subview %alloc_3[0, %6] [64, 32] [1, 1] : memref<64x64xi32, 1> to memref<64x32xi32, strided<[64, 1], offset: ?>, 1>
        %subview_9 = memref.subview %alloc_2[%5, %6] [32, 32] [1, 1] : memref<32x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
        // Zero-init accumulator — only possible in this peeled iteration.
        linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>)
        iree_linalg_ext.pack %subview_7 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1], offset: ?>, 1> memref<8x8x4x8xi32, 2>)
        iree_linalg_ext.pack %subview_8 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x8x8xi32, 2>)
        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
        ^bb0(%in: i32, %in_10: i32, %out: i32):
          %7 = arith.muli %in, %in_10 : i32
          %8 = arith.addi %out, %7 : i32
          linalg.yield %8 : i32
        }
        iree_linalg_ext.unpack %alloc_1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_9 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
      // Remaining reduction iterations (k = 64 .. 1024): the running result is
      // copied back into %alloc_2 and re-packed as the accumulator.
      scf.for %arg4 = %c64 to %c1024 step %c64 {
        %subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
        %subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
        linalg.copy ins(%subview_7 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
        linalg.copy ins(%subview_8 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
        linalg.copy ins(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>) outs(%alloc_2 : memref<32x64xi32, 1>)
        scf.forall (%arg5, %arg6) in (1, 2) {
          %5 = affine.apply #map(%arg5)
          %6 = affine.apply #map(%arg6)
          %subview_9 = memref.subview %alloc_4[%5, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1], offset: ?>, 1>
          %subview_10 = memref.subview %alloc_3[0, %6] [64, 32] [1, 1] : memref<64x64xi32, 1> to memref<64x32xi32, strided<[64, 1], offset: ?>, 1>
          %subview_11 = memref.subview %alloc_2[%5, %6] [32, 32] [1, 1] : memref<32x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
          // Pack the previous partial result as the accumulator (no fill here).
          iree_linalg_ext.pack %subview_11 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_1 : (memref<32x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x4x8xi32, 2>)
          iree_linalg_ext.pack %subview_9 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1], offset: ?>, 1> memref<8x8x4x8xi32, 2>)
          iree_linalg_ext.pack %subview_10 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x8x8xi32, 2>)
          linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
          ^bb0(%in: i32, %in_12: i32, %out: i32):
            %7 = arith.muli %in, %in_12 : i32
            %8 = arith.addi %out, %7 : i32
            linalg.yield %8 : i32
          }
          iree_linalg_ext.unpack %alloc_1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_11 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
        } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
        linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
      }
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}
// -----// IR Dump After PackToDma (iree-amdaie-pack-to-dma) //----- //
// Same map aliases as the previous dump; unchanged by PackToDma.
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
  // Same peeled matmul after PackToDma: the iree_linalg_ext.pack/unpack ops
  // have been rewritten into air.dma_memcpy_nd with explicit 4-d
  // offset/size/stride access patterns; the surrounding tiling is unchanged.
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)
      %subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
      // Peeled first reduction iteration (k = 0).
      %subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
      %subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
      linalg.copy ins(%subview_5 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
      linalg.copy ins(%subview_6 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
      scf.forall (%arg4, %arg5) in (1, 2) {
        %5 = affine.apply #map(%arg4)
        %6 = affine.apply #map(%arg5)
        linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>)
        // Former pack ops: DMA the A and B tiles into L2 in packed layout.
        air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_4[%c0, %c0, %5, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<8x8x4x8xi32, 2>, memref<32x64xi32, 1>)
        air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %alloc_3[%c0, %c0, %c0, %6] [%c4, %c8, %c8, %c8] [%c8, %c512, %c64, %c1]) : (memref<4x8x8x8xi32, 2>, memref<64x64xi32, 1>)
        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
        ^bb0(%in: i32, %in_7: i32, %out: i32):
          %7 = arith.muli %in, %in_7 : i32
          %8 = arith.addi %out, %7 : i32
          linalg.yield %8 : i32
        }
        // Former unpack op: DMA the packed result back to the L1 output tile.
        air.dma_memcpy_nd (%alloc_2[%5, %6] [%c32, %c32] [%c64, %c1], %alloc_1[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (memref<32x64xi32, 1>, memref<4x8x4x8xi32, 2>)
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
      // Remaining reduction iterations (k = 64 .. 1024).
      scf.for %arg4 = %c64 to %c1024 step %c64 {
        %subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
        %subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
        linalg.copy ins(%subview_7 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
        linalg.copy ins(%subview_8 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
        linalg.copy ins(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>) outs(%alloc_2 : memref<32x64xi32, 1>)
        scf.forall (%arg5, %arg6) in (1, 2) {
          %5 = affine.apply #map(%arg5)
          %6 = affine.apply #map(%arg6)
          // Accumulator init DMA replaces the peeled iteration's linalg.fill.
          air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c4, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_2[%c0, %c0, %5, %6] [%c4, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<4x8x4x8xi32, 2>, memref<32x64xi32, 1>)
          air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_4[%c0, %c0, %5, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<8x8x4x8xi32, 2>, memref<32x64xi32, 1>)
          air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %alloc_3[%c0, %c0, %c0, %6] [%c4, %c8, %c8, %c8] [%c8, %c512, %c64, %c1]) : (memref<4x8x8x8xi32, 2>, memref<64x64xi32, 1>)
          linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
          ^bb0(%in: i32, %in_9: i32, %out: i32):
            %7 = arith.muli %in, %in_9 : i32
            %8 = arith.addi %out, %7 : i32
            linalg.yield %8 : i32
          }
          air.dma_memcpy_nd (%alloc_2[%5, %6] [%c32, %c32] [%c64, %c1], %alloc_1[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (memref<32x64xi32, 1>, memref<4x8x4x8xi32, 2>)
        } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
        linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
      }
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}
// -----// IR Dump After DmaToObjectFifo (Conceptually, could be multiple transformations) //----- //
// Here, we insert the objectfifo state machines.
//
// Skipped steps among others:
// - Tile assignment to objectfifos; note that this potentially impacts the complexity of figuring out whether objectfifos can be reused across the peeled/non-peeled parts of the reduction loop.
//   -> Could potentially be solved using logical objectfifos on buffers/memories, with partial tile constraints. For example, only on the tile destination side, as this information would be needed as a starting point
//      to derive which objectfifos could potentially be merged.
// - The aiex.ipu.load(%core) operation doesn't exist yet as far as I can tell. This would be needed to reload the AIE core `main` within a single xclbin.
//
// NOTES:
// - Parallelized core code could still be represented more compactly at this stage using scf.forall instead of unrolling
// - Core code could still be represented using linalg.generic in earlier stages
// - Tiles and cores are non-SSA for simplicity here (tile_0_1, core_0_2)
// - Can objectfifos support multi-producer / single-consumer?
// Same map aliases as the previous dumps; unchanged by DmaToObjectFifo.
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
  // Conceptual sketch: DMAs replaced by AIE objectfifo state machines, cores,
  // and IPU control code. Per the author's notes this is not yet valid IR
  // (nested func.func inside scf.forall, non-SSA tiles/cores, hypothetical
  // aiex.ipu.load). Review fixes applied relative to the original sketch:
  //  - declare the shim tile %tile_0_0, which was used but never defined
  //  - four flat toStream dims instead of a malformed nested dim entry
  //  - `@outC1` (missing `@`) in the objectfifo.link output lists
  //  - second-section %core_1_2 now produces/releases @outC1, not @outC0
  //  - bare `@fill_matmul(...)` / `@matmul(...)` written as func.call
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)
      %subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
      %subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
      %subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
      // Shim tile (0, 0): referenced by the objectfifos below but was never
      // declared in the original sketch.
      %tile_0_0 = aie.tile(0, 0)
      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      // === Peeled iteration (k = 0): fill + matmul ===
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0, @inB1]()
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo.link [@outC0, @outC1] -> [@memC]()
      %core_0_2 = aie.core(%tile_0_2) {
        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        // Peeled iteration: zero the accumulator, then multiply-accumulate.
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      %core_1_2 = aie.core(%tile_1_2) {
        %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Control code for the peeled iteration (conceptual: nested func.func).
      func.func @fill_matmul(%arg0: memref<32x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<32x64xi32>) {
        // NOTE(review): aiex.ipu.load is hypothetical (see notes above).
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
        aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        return
      }
      // NOTE(review): operand types are the strided L3 subviews while
      // @fill_matmul is declared on contiguous memrefs; casts would be
      // required in real IR.
      func.call @fill_matmul(%subview_5, %subview_6, %subview) : (memref<32x64xi32, strided<[?, ?], offset: ?>>, memref<64x64xi32, strided<[?, ?], offset: ?>>, memref<32x64xi32, strided<[64, 1], offset: ?>>) -> ()
      // === Remaining reduction iterations (k = 64 .. 1024) ===
      // NOTE(review): the objectfifo/core symbols below shadow the ones above;
      // reuse vs. re-declaration is one of the open questions in the notes.
      scf.for %arg4 = %c64 to %c1024 step %c64 {
        %subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
        %subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
        %tile_0_0 = aie.tile(0, 0)
        %tile_0_1 = aie.tile(0, 1)
        %tile_0_2 = aie.tile(0, 2)
        %tile_1_1 = aie.tile(1, 1)
        %tile_1_2 = aie.tile(1, 2)
        aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
        aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
        // Extra input path @memI/@inI*: streams the previous partial result in
        // as the accumulator (replaces the peeled iteration's fill).
        aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
        aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
        aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
        aie.objectfifo.link [@memI] -> [@inI0, @inI1]()
        aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
        aie.objectfifo.link [@memA] -> [@inA]()
        aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
        aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
        aie.objectfifo.link [@memB] -> [@inB0, @inB1]()
        aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
        aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
        aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
        aie.objectfifo.link [@outC0, @outC1] -> [@memC]()
        %core_0_2 = aie.core(%tile_0_2) {
          %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          // Accumulating variant: takes the incoming partial result %3.
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI0(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB0(Consume, 1)
          aie.objectfifo.release @outC0(Produce, 1)
          aie.end
        } {link_with = "mm.o"}
        %core_1_2 = aie.core(%tile_1_2) {
          // Fixed vs. original sketch: this core produces @outC1, not @outC0.
          %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI1(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB1(Consume, 1)
          aie.objectfifo.release @outC1(Produce, 1)
          aie.end
        } {link_with = "mm.o"}
        // Control code for one non-peeled iteration.
        func.func @matmul(%arg0: memref<32x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<32x64xi32>) {
          aiex.ipu.load(%core_0_2)
          aiex.ipu.load(%core_1_2)
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memI} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
          aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
          return
        }
        func.call @matmul(%subview_7, %subview_8, %subview) : (memref<32x64xi32, strided<[?, ?], offset: ?>>, memref<64x64xi32, strided<[?, ?], offset: ?>>, memref<32x64xi32, strided<[64, 1], offset: ?>>) -> ()
      }
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}
// -----// IR Dump After AieBringInLoopsIntoAieControlCode (Conceptually, could be multiple transformations) //----- //
// Steps:
// - Move subviews into aiex.ipu.dma_memcpy_nd
// - Move loops into the AIE control-code function
// NOTES:
// - aiex.ipu.dma_memcpy_nd strides seem to be 3-dimensional in examples (stride0 is assumed to be always 1?); I use four dimensions here, similar to air.dma_memcpy_nd
// - Tiles and cores are non-SSA for simplicity here (tile_0_1, core_0_2)
// Same map aliases as the previous dumps; unchanged by this step.
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module { | |
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> { | |
%c512 = arith.constant 512 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c256 = arith.constant 256 : index | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c1024 = arith.constant 1024 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>> | |
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>> | |
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2> | |
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2> | |
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2> | |
%alloc_2 = memref.alloc() : memref<32x64xi32, 1> | |
%alloc_3 = memref.alloc() : memref<64x64xi32, 1> | |
%alloc_4 = memref.alloc() : memref<32x64xi32, 1> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32> | |
scf.forall (%arg2, %arg3) in (1, 1) { | |
%3 = affine.apply #map(%arg2) | |
%4 = affine.apply #map1(%arg3) | |
%tile_0_1 = aie.tile(0, 1) | |
%tile_0_2 = aie.tile(0, 2) | |
%tile_1_1 = aie.tile(1, 1) | |
%tile_1_2 = aie.tile(1, 2) | |
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>> | |
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>> | |
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>> | |
aie.objectfifo.link [@memA] -> [@inA]() | |
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>> | |
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>> | |
aie.objectfifo.link [@memB] -> [@inB0,@inB1]() | |
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>> | |
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>> | |
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>> | |
aie.objectfifo.link [@outC0,outC1] -> [@memC]() | |
%core_0_2 = aie.core(%tile_0_2) { | |
%0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32> | |
%4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32> | |
func.call @fill(%1) : (memref<4x8x4x8xi32>) -> () | |
func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> () | |
aie.objectfifo.release @inA(Consume, 1) | |
aie.objectfifo.release @inB0(Consume, 1) | |
aie.objectfifo.release @outC0(Produce, 1) | |
aie.end | |
} {link_with = "mm.o"} | |
%core_1_2 = aie.core(%tile_1_2) { | |
%0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32> | |
%4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32> | |
func.call @fill(%1) : (memref<4x8x4x8xi32>) -> () | |
func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> () | |
aie.objectfifo.release @inA(Consume, 1) | |
aie.objectfifo.release @inB1(Consume, 1) | |
aie.objectfifo.release @outC1(Produce, 1) | |
aie.end | |
} {link_with = "mm.o"} | |
func.func @fill_matmul(%arg0: memref<32x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<32x64xi32>) { | |
aiex.ipu.load(%core_0_2) | |
aiex.ipu.load(%core_1_2) | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32> | |
aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} | |
return | |
} | |
fill_matmul(%1, %0, %alloca) | |
%tile_0_1 = aie.tile(0, 1) | |
%tile_0_2 = aie.tile(0, 2) | |
%tile_1_1 = aie.tile(1, 1) | |
%tile_1_2 = aie.tile(1, 2) | |
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>> | |
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>> | |
aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>> | |
aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>> | |
aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>> | |
aie.objectfifo.link [@memI] -> [@inI0,@inI1]() | |
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>> | |
aie.objectfifo.link [@memA] -> [@inA]() | |
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>> | |
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>> | |
aie.objectfifo.link [@memB] -> [@inB0,@inB1]() | |
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>> | |
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>> | |
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>> | |
aie.objectfifo.link [@outC0,outC1] -> [@memC]() | |
%core_0_2 = aie.core(%tile_0_2) { | |
%0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32> | |
%6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>> | |
%7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32> | |
func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> () | |
aie.objectfifo.release @inI0(Consume, 1) | |
aie.objectfifo.release @inA(Consume, 1) | |
aie.objectfifo.release @inB0(Consume, 1) | |
aie.objectfifo.release @outC0(Produce, 1) | |
aie.end | |
} {link_with = "mm.o"} | |
%core_1_2 = aie.core(%tile_1_2) { | |
%0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32> | |
%6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>> | |
%7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32> | |
func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> () | |
aie.objectfifo.release @inI1(Consume, 1) | |
aie.objectfifo.release @inA(Consume, 1) | |
aie.objectfifo.release @inB1(Consume, 1) | |
aie.objectfifo.release @outC0(Produce, 1) | |
aie.end | |
} {link_with = "mm.o"} | |
func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) { | |
scf.for %arg4 = %c64 to %c1024 step %c64 { | |
aiex.ipu.load(%core_0_2) | |
aiex.ipu.load(%core_1_2) | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memI} : memref<32x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32> | |
aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} | |
} | |
return | |
} | |
@matmul(%1, %0, %subview) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
%2 = bufferization.to_tensor %alloca : memref<32x64xi32> | |
memref.dealloc %alloc_4 : memref<32x64xi32, 1> | |
memref.dealloc %alloc_3 : memref<64x64xi32, 1> | |
memref.dealloc %alloc_2 : memref<32x64xi32, 1> | |
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2> | |
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2> | |
memref.dealloc %alloc : memref<4x8x8x8xi32, 2> | |
return %2 : tensor<32x64xi32> | |
} | |
} | |
// -----// IR Dump After AieBringInLoopsIntoAieCoreCode (conceptually; could be multiple transformations) //----- //
// Move loops into the AIE core code to avoid reloading the cores' `main` program.
// Steps:
// - Move subviews into aiex.ipu.dma_memcpy_nd
// - Move loops into the AIE control-code function
// NOTES:
// - aiex.ipu.dma_memcpy_nd strides appear to be 3-dimensional in the upstream examples (stride0 is assumed to always be 1?); four dimensions are used here, similar to air.dma_memcpy_nd
// - Tiles and cores are non-SSA for simplicity here (tile_0_1, core_0_2)
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
// Same peeled structure as the previous dump, but the K-loop now lives inside
// the aie.core bodies (and inside @matmul's control code), so each core's
// `main` program is loaded once per phase instead of once per iteration.
// NOTE(review): tiles, cores and objectfifo symbols are deliberately
// re-declared for the two phases; the sketch is non-SSA for simplicity.
module {
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)
      // --- Phase 1: peeled fill + first matmul K-tile ---
      // FIX(review): %tile_0_0 was referenced by the @mem* objectfifos below
      // but never declared; declare it alongside the other tiles.
      %tile_0_0 = aie.tile(0, 0)
      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // FIX(review): '@' was missing on outC1 in the link operand list.
      aie.objectfifo.link [@outC0, @outC1] -> [@memC]()
      // Core (0,2): zero-fill its output tile, then one matmul accumulation.
      %core_0_2 = aie.core(%tile_0_2) {
        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Core (1,2): same as core (0,2) but on the inB1/outC1 fifos.
      %core_1_2 = aie.core(%tile_1_2) {
        %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Control code for the peeled iteration.
      func.func @fill_matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
        aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        return
      }
      @fill_matmul(%1, %0, %alloca)
      // --- Phase 2: remaining K-tiles; the K-loop is now inside the cores ---
      %tile_0_0 = aie.tile(0, 0)
      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // FIX(review): '@' was missing on outC1 in the link operand list.
      aie.objectfifo.link [@outC0, @outC1] -> [@memC]()
      // Core (0,2): loop over the remaining K-tiles, accumulating on top of
      // the incoming partial (inI0).
      %core_0_2 = aie.core(%tile_0_2) {
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI0(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB0(Consume, 1)
          aie.objectfifo.release @outC0(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Core (1,2): same loop on the inI1/inB1/outC1 fifos.
      // FIX(review): this core acquired and released @outC0; it must use
      // @outC1, consistent with its @inI1/@inB1 usage and the other dumps.
      %core_1_2 = aie.core(%tile_1_2) {
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI1(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB1(Consume, 1)
          aie.objectfifo.release @outC1(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Control code: the cores are loaded once, outside the K-loop.
      // NOTE(review): @memI and @memA both use id = 2 here — presumably the ids
      // should be unique per transfer; confirm against the aiex.ipu semantics.
      func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memI} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
          aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        }
        return
      }
      @matmul(%1, %0, %alloca)
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}
// -----// IR Dump After AieSuperimposeStateMachines (conceptually; could be multiple transformations) //----- //
// Here, we check for overlapping objectfifo state machines, as seen above (the peeled fill + matmul is a 'sub' state machine of the unpeeled matmul),
// and keep only the 'super' state machine.
#map = affine_map<(d0) -> (d0 * 32)> | |
#map1 = affine_map<(d0) -> (d0 * 64)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> | |
module { | |
func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> { | |
%c512 = arith.constant 512 : index | |
%c1 = arith.constant 1 : index | |
%c32 = arith.constant 32 : index | |
%c256 = arith.constant 256 : index | |
%c4 = arith.constant 4 : index | |
%c8 = arith.constant 8 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c1024 = arith.constant 1024 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>> | |
%1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>> | |
%alloc = memref.alloc() : memref<4x8x8x8xi32, 2> | |
%alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2> | |
%alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2> | |
%alloc_2 = memref.alloc() : memref<32x64xi32, 1> | |
%alloc_3 = memref.alloc() : memref<64x64xi32, 1> | |
%alloc_4 = memref.alloc() : memref<32x64xi32, 1> | |
%alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32> | |
scf.forall (%arg2, %arg3) in (1, 1) { | |
%3 = affine.apply #map(%arg2) | |
%4 = affine.apply #map1(%arg3) | |
%tile_0_1 = aie.tile(0, 1) | |
%tile_0_2 = aie.tile(0, 2) | |
%tile_1_1 = aie.tile(1, 1) | |
%tile_1_2 = aie.tile(1, 2) | |
aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>> | |
aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>> | |
aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>> | |
aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>> | |
aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>> | |
aie.objectfifo.link [@memI] -> [@inI0,@inI1]() | |
aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>> | |
aie.objectfifo.link [@memA] -> [@inA]() | |
aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>> | |
aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>> | |
aie.objectfifo.link [@memB] -> [@inB0,@inB1]() | |
aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>> | |
aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>> | |
aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>> | |
aie.objectfifo.link [@outC0,outC1] -> [@memC]() | |
%core_0_2_ = aie.core(%tile_0_2) { | |
%0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32> | |
%4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32> | |
func.call @fill(%1) : (memref<4x8x4x8xi32>) -> () | |
func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> () | |
aie.objectfifo.release @inA(Consume, 1) | |
aie.objectfifo.release @inB0(Consume, 1) | |
aie.objectfifo.release @outC0(Produce, 1) | |
aie.end | |
} {link_with = "mm.o"} | |
%core_1_2_ = aie.core(%tile_1_2) { | |
%0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32> | |
%4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32> | |
func.call @fill(%1) : (memref<4x8x4x8xi32>) -> () | |
func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> () | |
aie.objectfifo.release @inA(Consume, 1) | |
aie.objectfifo.release @inB1(Consume, 1) | |
aie.objectfifo.release @outC1(Produce, 1) | |
aie.end | |
} {link_with = "mm.o"} | |
%core_0_2 = aie.core(%tile_0_2) { | |
scf.for %arg4 = %c64 to %c1024 step %c64 { | |
%0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32> | |
%6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>> | |
%7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32> | |
func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> () | |
aie.objectfifo.release @inI0(Consume, 1) | |
aie.objectfifo.release @inA(Consume, 1) | |
aie.objectfifo.release @inB0(Consume, 1) | |
aie.objectfifo.release @outC0(Produce, 1) | |
} | |
aie.end | |
} {link_with = "mm.o"} | |
%core_1_2 = aie.core(%tile_1_2) { | |
scf.for %arg4 = %c64 to %c1024 step %c64 { | |
%0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32> | |
%4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32> | |
%6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>> | |
%7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32> | |
func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> () | |
aie.objectfifo.release @inI1(Consume, 1) | |
aie.objectfifo.release @inA(Consume, 1) | |
aie.objectfifo.release @inB1(Consume, 1) | |
aie.objectfifo.release @outC1(Produce, 1) | |
} | |
aie.end | |
} {link_with = "mm.o"} | |
func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) { | |
aiex.ipu.load(%core_0_2_) | |
aiex.ipu.load(%core_1_2_) | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32> | |
aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} | |
aiex.ipu.load(%core_0_2) | |
aiex.ipu.load(%core_1_2) | |
scf.for %arg4 = %c64 to %c1024 step %c64 { | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memI} : memref<32x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32> | |
aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32> | |
aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} | |
} | |
return | |
} | |
@matmul(%1, %0, %alloca) | |
} {mapping = [#gpu.block<y>, #gpu.block<x>]} | |
%2 = bufferization.to_tensor %alloca : memref<32x64xi32> | |
memref.dealloc %alloc_4 : memref<32x64xi32, 1> | |
memref.dealloc %alloc_3 : memref<64x64xi32, 1> | |
memref.dealloc %alloc_2 : memref<32x64xi32, 1> | |
memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2> | |
memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2> | |
memref.dealloc %alloc : memref<4x8x8x8xi32, 2> | |
return %2 : tensor<32x64xi32> | |
} | |
} | |
// -----// IR Dump After AieCombineCoreCode (Conceptually, could be multiple transformations) //----- // | |
// Here, we try to combine core code into a single code block (will become a single elf) to avoid reloading `main` | |
// Tiling maps (block index -> element offset: rows of 32, columns of 64) and
// the indexing maps of the packed (4D-tiled) matmul contraction.
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
  // C(32x64) = A(32x1024) * B(1024x64), i32, mapped onto one AIE column with
  // two compute cores. The K dimension (1024) is processed in 64-wide steps:
  // the first step is peeled (cores call @fill to zero-init the accumulator),
  // the remaining 15 steps (scf.for 64..1024 step 64) re-read the partial
  // result through the @memI/@inI* object FIFOs. After AieCombineCoreCode the
  // peeled and steady-state code share one core body (a single ELF per core),
  // so the host program loads each core exactly once.
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    // Scratch buffers: memory space 2 = core-local, 1 = mem tile.
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)
      // Tile declarations. Tile (0,0) is the shim tile that moves data
      // between host memory and the column; it was referenced but never
      // declared in the original dump, so it is added here.
      %tile_0_0 = aie.tile(0, 0)
      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      // Host -> mem-tile object FIFOs (depth 2 = double-buffered).
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // Partial-result (accumulator init) streams, one per compute core.
      aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo.link [@memI] -> [@inI0, @inI1]()
      // A is broadcast to both compute tiles.
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      // B is split column-wise: left half to core (0,2), right half to (1,2).
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0, @inB1]()
      // Per-core output streams, joined into @memC towards the shim tile.
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // Fixed: the original dump wrote `[@outC0,outC1]` (missing '@' sigil).
      aie.objectfifo.link [@outC0, @outC1] -> [@memC]()
      // Core (0,2): computes the left 32x32 half of C.
      %core_0_2 = aie.core(%tile_0_2) {
        // Peeled first K-step: zero-init accumulator, then multiply-accumulate.
        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        // Steady state: re-read the partial result via @inI0 and accumulate.
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          %6 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %8 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %10 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %12 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%9, %11, %13, %7) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI0(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB0(Consume, 1)
          aie.objectfifo.release @outC0(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Core (1,2): computes the right 32x32 half of C.
      %core_1_2 = aie.core(%tile_1_2) {
        %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          // Fixed: the original dump acquired @outC0 here, but this core
          // produces into (and releases, below) @outC1.
          %6 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %8 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %10 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %12 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%9, %11, %13, %7) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI1(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB1(Consume, 1)
          aie.objectfifo.release @outC1(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Host control program: with the combined core code, each core ELF is
      // loaded exactly once (no reload between the peeled and steady-state
      // phases, which was the point of AieCombineCoreCode).
      func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        // Peeled K-step: first 32x64 slice of A, first 64x64 slice of B.
        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
        aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        // Remaining 15 K-steps: feed the previous partial result back in via
        // @memI, then stream the next K-slices of A and B.
        // NOTE(review): @memI and @memA share id = 2 — confirm whether ids
        // must be unique per in-flight transfer.
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memI} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
          aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        }
        return
      }
      // (Pseudo-IR in this conceptual dump) invoke the host control function.
      @matmul(%1, %0, %alloca)
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment