jtuyls/peeled_matmul_to_aie.mlir Secret

## peeled_matmul_to_aie.mlir
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
// Starting point of the walkthrough: a bufferized, tiled i32 matmul
// (32x1024 times 1024x64 -> 32x64). The reduction dimension is processed in
// chunks of 64; the first chunk (offset 0) is peeled out in front of the
// scf.for loop so that it can zero-fill the accumulator instead of reloading it.
module {
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    // %0 is the right-hand operand (1024x64), %1 the left-hand operand (32x1024).
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    // Scratch buffers. The trailing `2` / `1` in the memref types are memory-space
    // annotations. NOTE(review): presumably 2 = core-local memory and 1 = the
    // shared memory level in between -- confirm against the target description.
    // Packed (tiled-layout) operands and accumulator, memory space 2:
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    // Row-major staging buffers, memory space 1 (%alloc_2: result, %alloc_3: rhs
    // slice, %alloc_4: lhs slice):
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    // Result buffer in the default memory space; converted back to a tensor at the end.
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    // Outer (block-level) tiling; a single 1x1 iteration covers the whole 32x64 output.
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)
      %subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
      // ---- Peeled first reduction step (reduction offset 0) ----
      %subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
      %subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
      linalg.copy ins(%subview_5 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
      linalg.copy ins(%subview_6 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
      // Inner (thread-level) tiling: two iterations along the second dimension,
      // each producing a 32x32 column slice of the 32x64 result.
      scf.forall (%arg4, %arg5) in (1, 2) {
        %5 = affine.apply #map(%arg4)
        %6 = affine.apply #map(%arg5)
        %subview_7 = memref.subview %alloc_4[%5, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1], offset: ?>, 1>
        %subview_8 = memref.subview %alloc_3[0, %6] [64, 32] [1, 1] : memref<64x64xi32, 1> to memref<64x32xi32, strided<[64, 1], offset: ?>, 1>
        %subview_9 = memref.subview %alloc_2[%5, %6] [32, 32] [1, 1] : memref<32x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
        // Only the peeled step zero-initializes the packed accumulator.
        linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>)
        // Pack the lhs slice into 4x8 tiles and the rhs slice into 8x8 tiles.
        iree_linalg_ext.pack %subview_7 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1], offset: ?>, 1> memref<8x8x4x8xi32, 2>)
        iree_linalg_ext.pack %subview_8 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x8x8xi32, 2>)
        // Multiply-accumulate on the packed layout (d2 and d5 are the reduction dims).
        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
        ^bb0(%in: i32, %in_10: i32, %out: i32):
          %7 = arith.muli %in, %in_10 : i32
          %8 = arith.addi %out, %7 : i32
          linalg.yield %8 : i32
        }
        // Write the packed accumulator back into this iteration's 32x32 slice of %alloc_2.
        iree_linalg_ext.unpack %alloc_1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_9 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
      // ---- Remaining reduction steps: offsets 64, 128, ..., 960 ----
      scf.for %arg4 = %c64 to %c1024 step %c64 {
        %subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
        %subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
        linalg.copy ins(%subview_7 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
        linalg.copy ins(%subview_8 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
        // Bring the partial result of the previous step back into the staging buffer.
        linalg.copy ins(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>) outs(%alloc_2 : memref<32x64xi32, 1>)
        scf.forall (%arg5, %arg6) in (1, 2) {
          %5 = affine.apply #map(%arg5)
          %6 = affine.apply #map(%arg6)
          %subview_9 = memref.subview %alloc_4[%5, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1], offset: ?>, 1>
          %subview_10 = memref.subview %alloc_3[0, %6] [64, 32] [1, 1] : memref<64x64xi32, 1> to memref<64x32xi32, strided<[64, 1], offset: ?>, 1>
          %subview_11 = memref.subview %alloc_2[%5, %6] [32, 32] [1, 1] : memref<32x64xi32, 1> to memref<32x32xi32, strided<[64, 1], offset: ?>, 1>
          // Unlike the peeled step there is no linalg.fill: the accumulator is
          // initialized by packing the partial result from %alloc_2.
          iree_linalg_ext.pack %subview_11 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_1 : (memref<32x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x4x8xi32, 2>)
          iree_linalg_ext.pack %subview_9 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1], offset: ?>, 1> memref<8x8x4x8xi32, 2>)
          iree_linalg_ext.pack %subview_10 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[64, 1], offset: ?>, 1> memref<4x8x8x8xi32, 2>)
          linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
          ^bb0(%in: i32, %in_12: i32, %out: i32):
            %7 = arith.muli %in, %in_12 : i32
            %8 = arith.addi %out, %7 : i32
            linalg.yield %8 : i32
          }
          iree_linalg_ext.unpack %alloc_1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_11 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[64, 1], offset: ?>, 1>)
        } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
        linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
      }
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}

// -----// IR Dump After PackToDma (iree-amdaie-pack-to-dma) //----- //
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
// Same computation as the input IR, but every iree_linalg_ext.pack / unpack
// (together with the memref.subview feeding it) has been rewritten into an
// air.dma_memcpy_nd. Each DMA operand is written as
//   buffer[offsets] [sizes] [strides]
// with the destination first and the source second; the former subview offsets
// (%5, %6) now appear directly in the offset lists.
module {
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    // Index constants used as DMA offsets, sizes and strides.
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    // Packed buffers (memory space 2) and row-major staging buffers (memory space 1).
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)
      %subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
      // ---- Peeled first reduction step (reduction offset 0) ----
      // The copies into memory space 1 are still linalg.copy at this stage.
      %subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
      %subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
      linalg.copy ins(%subview_5 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
      linalg.copy ins(%subview_6 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
      scf.forall (%arg4, %arg5) in (1, 2) {
        %5 = affine.apply #map(%arg4)
        %6 = affine.apply #map(%arg5)
        linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>)
        // lhs pack: read the row-major 32x64 buffer as 8 column tiles (stride 8),
        // 8 row tiles (stride 4*64 = 256), 4 rows (stride 64), 8 contiguous elements;
        // write contiguously into the 8x8x4x8 buffer.
        air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_4[%c0, %c0, %5, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<8x8x4x8xi32, 2>, memref<32x64xi32, 1>)
        // rhs pack: 4 column tiles (stride 8), 8 row tiles (stride 8*64 = 512),
        // 8 rows (stride 64), 8 contiguous elements, starting at column offset %6.
        air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %alloc_3[%c0, %c0, %c0, %6] [%c4, %c8, %c8, %c8] [%c8, %c512, %c64, %c1]) : (memref<4x8x8x8xi32, 2>, memref<64x64xi32, 1>)
        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
        ^bb0(%in: i32, %in_7: i32, %out: i32):
          %7 = arith.muli %in, %in_7 : i32
          %8 = arith.addi %out, %7 : i32
          linalg.yield %8 : i32
        }
        // Unpack: the packed accumulator is read in row-major order of the result
        // (row tile stride 32, row stride 8, column tile stride 256, element stride 1)
        // and written to the 32x32 window of %alloc_2 at [%5, %6].
        air.dma_memcpy_nd (%alloc_2[%5, %6] [%c32, %c32] [%c64, %c1], %alloc_1[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (memref<32x64xi32, 1>, memref<4x8x4x8xi32, 2>)
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
      // ---- Remaining reduction steps: offsets 64, 128, ..., 960 ----
      scf.for %arg4 = %c64 to %c1024 step %c64 {
        %subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
        %subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>
        linalg.copy ins(%subview_7 : memref<32x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
        linalg.copy ins(%subview_8 : memref<64x64xi32, strided<[?, ?], offset: ?>>) outs(%alloc_3 : memref<64x64xi32, 1>)
        linalg.copy ins(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>) outs(%alloc_2 : memref<32x64xi32, 1>)
        scf.forall (%arg5, %arg6) in (1, 2) {
          %5 = affine.apply #map(%arg5)
          %6 = affine.apply #map(%arg6)
          // Accumulator reload: packs the 32x32 window of the partial result at
          // [%5, %6] into %alloc_1 (this replaces the linalg.fill of the peeled step).
          air.dma_memcpy_nd (%alloc_1[%c0, %c0, %c0, %c0] [%c4, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_2[%c0, %c0, %5, %6] [%c4, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<4x8x4x8xi32, 2>, memref<32x64xi32, 1>)
          // lhs and rhs packs, identical to the peeled step.
          air.dma_memcpy_nd (%alloc_0[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %alloc_4[%c0, %c0, %5, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (memref<8x8x4x8xi32, 2>, memref<32x64xi32, 1>)
          air.dma_memcpy_nd (%alloc[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %alloc_3[%c0, %c0, %c0, %6] [%c4, %c8, %c8, %c8] [%c8, %c512, %c64, %c1]) : (memref<4x8x8x8xi32, 2>, memref<64x64xi32, 1>)
          linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
          ^bb0(%in: i32, %in_9: i32, %out: i32):
            %7 = arith.muli %in, %in_9 : i32
            %8 = arith.addi %out, %7 : i32
            linalg.yield %8 : i32
          }
          // Unpack the updated accumulator back into the same window of %alloc_2.
          air.dma_memcpy_nd (%alloc_2[%5, %6] [%c32, %c32] [%c64, %c1], %alloc_1[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (memref<32x64xi32, 1>, memref<4x8x4x8xi32, 2>)
        } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
        linalg.copy ins(%alloc_2 : memref<32x64xi32, 1>) outs(%subview : memref<32x64xi32, strided<[64, 1], offset: ?>>)
      }
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}

// -----// IR Dump After DmaToObjectFifo (Conceptually, could be multiple transformations) //----- //
// Here, we insert the objectfifo state machines.
//
// Skipped steps among others:
// - Tile assignment to objectfifos. Note that this potentially impacts the complexity of figuring out whether objectfifos can be reused across the peeled and non-peeled parts of the reduction loop.
//   -> Could potentially be solved using logical objectfifos on buffers/memories, with partial tile constraints. For example only on the tile destination side as this information would be needed as a starting point
//      to derive which objectfifos could potentially be merged.
// - The aiex.ipu.load(%core) operation doesn't exist yet as far as I can tell. This would be needed to reload the AIE core `main` within a single xclbin.
//
// NOTES:
// - Parallelized core code could still be represented more compactly at this stage using scf.forall instead of unrolling
// - Core code could still be represented using linalg.generic in earlier stages
// - Tiles and cores are non-ssa for simplicity here (tile_0_1, core_0_2)
// - Can objectfifos support multi producer - single consumer?
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
// Conceptual IR (hand-written, not expected to pass the verifier): the
// air.dma_memcpy_nd transfers of the PackToDma stage are expressed as
// aie.objectfifo declarations. Two AIE designs are described: one for the
// peeled first reduction step (control code @fill_matmul, cores zero-fill the
// accumulator) and one for the remaining steps inside the scf.for loop
// (control code @matmul, cores receive the partial result through @memI).
//
// The toStream lists are written innermost dimension first, i.e. they are the
// (size, stride) pairs of the corresponding air.dma_memcpy_nd source operand
// in reverse order.
module {
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    // Leftovers from the previous stage; the objectfifos below take over their role.
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)
      %subview = memref.subview %alloca[%3, %4] [32, 64] [1, 1] : memref<32x64xi32> to memref<32x64xi32, strided<[64, 1], offset: ?>>
      %subview_5 = memref.subview %1[0, 0] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
      %subview_6 = memref.subview %0[0, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>

      // ---- AIE design for the peeled first reduction step ----
      // %tile_0_0 is the endpoint that the control-code DMAs (@memA/@memB/@memC)
      // talk to. NOTE(review): presumably the shim tile of column 0 -- confirm.
      %tile_0_0 = aie.tile(0, 0)
      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      // Row-major operands arrive at %tile_0_1 ...
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      // ... and are forwarded to the cores in packed layout. The lhs goes to both
      // cores through one fifo; the rhs is split into one fifo per core.
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      // Outermost rhs dimension: 4 column tiles with stride 8 (see the matching
      // air.dma_memcpy_nd sizes [4, 8, 8, 8] / strides [8, 512, 64, 1]).
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      // Results are unpacked on the way out of each core and joined into @memC.
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
      // Core (0, 2): zero-fill the accumulator, then multiply-accumulate into it.
      %core_0_2 = aie.core(%tile_0_2) {
        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Core (1, 2): same as above on its own rhs slice (@inB1) and output fifo (@outC1).
      %core_1_2 = aie.core(%tile_1_2) {
        %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Control code: load both cores, push lhs (2048 elements) and rhs (4096
      // elements), read back the 2048-element result, then wait for completion.
      func.func @fill_matmul(%arg0: memref<32x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
        aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        return
      }
      @fill_matmul(%subview_5, %subview_6, %subview)

      // ---- Remaining reduction steps: offsets 64, 128, ..., 960 ----
      scf.for %arg4 = %c64 to %c1024 step %c64 {
        %subview_7 = memref.subview %1[0, %arg4] [32, 64] [1, 1] : memref<32x1024xi32, strided<[?, ?], offset: ?>> to memref<32x64xi32, strided<[?, ?], offset: ?>>
        %subview_8 = memref.subview %0[%arg4, 0] [64, 64] [1, 1] : memref<1024x64xi32, strided<[?, ?], offset: ?>> to memref<64x64xi32, strided<[?, ?], offset: ?>>

        // AIE design for a non-peeled step. It differs from the peeled design only
        // in the extra @memI -> @inI0/@inI1 path that delivers the partial result
        // to the cores as the initial accumulator value.
        %tile_0_0 = aie.tile(0, 0)
        %tile_0_1 = aie.tile(0, 1)
        %tile_0_2 = aie.tile(0, 2)
        %tile_1_1 = aie.tile(1, 1)
        %tile_1_2 = aie.tile(1, 2)
        aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
        aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
        aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
        aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
        aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
        aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
        aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
        aie.objectfifo.link [@memA] -> [@inA]()
        aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
        aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
        aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
        aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
        aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
        aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
        aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
        // Core (0, 2): no fill; the kernel takes the incoming partial result (%3)
        // as an extra operand next to lhs (%5), rhs (%7) and output (%1).
        %core_0_2 = aie.core(%tile_0_2) {
          %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI0(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB0(Consume, 1)
          aie.objectfifo.release @outC0(Produce, 1)
          aie.end
        } {link_with = "mm.o"}
        // Core (1, 2): produces into its own output fifo @outC1, whose producer
        // tile is %tile_1_2 (@outC0 belongs to %tile_0_2).
        %core_1_2 = aie.core(%tile_1_2) {
          %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI1(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB1(Consume, 1)
          aie.objectfifo.release @outC1(Produce, 1)
          aie.end
        } {link_with = "mm.o"}
        // Control code: %arg2 is both sent in as the partial result (@memI) and
        // received back as the updated result (@memC). Each transfer carries its
        // own id so that the @memI and @memA transfers do not share one.
        func.func @matmul(%arg0: memref<32x64xi32>, %arg1: memref<64x64xi32>, %arg2: memref<32x64xi32>) {
          aiex.ipu.load(%core_0_2)
          aiex.ipu.load(%core_1_2)
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 3 : i64, metadata = @memI} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
          aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
          return
        }
        @matmul(%subview_7, %subview_8, %subview)
      }
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}


// -----// IR Dump After AieBringInLoopsIntoAieControlCode (Conceptually, could be multiple transformations) //----- //
// Steps:
// - Move subviews into aiex.ipu.dma_memcpy_nd
// - Move loops into AIE control code function
// NOTES:
// - aiex.ipu.dma_memcpy_nd strides seem to be 3-dimensional in examples (stride0 is assumed to always be 1?); I use four dimensions here, similar to air.dma_memcpy_nd
// - Tiles and cores are non-ssa for simplicity here (tile_0_1, core_0_2)
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)

      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      // Output path: each compute tile produces its C block towards %tile_0_1, and the two
      // fifos are linked into @memC, whose consumer is %tile_0_0.
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // Every entry of the link list is a symbol reference and needs its leading '@'.
      aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
      %core_0_2 = aie.core(%tile_0_2) {
        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      %core_1_2 = aie.core(%tile_1_2) {
        %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Control code for the peeled first iteration: loads both core programs, issues one
      // transfer each for @memA, @memB and @memC, then waits on the sync.
      // The operand types are the full A (32x1024) and B (1024x64) buffers that the call
      // below passes in; the 32x64 window of A is selected by the sizes [1, 1, 32, 64] and
      // the row stride of 1024 in the first dma_memcpy_nd.
      func.func @fill_matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
        aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        return
      }
      // Pseudo-call of the control code; the callee is referenced by symbol, hence the '@'.
      @fill_matmul(%1, %0, %alloca)

      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      // Output path: each compute tile produces its C block towards %tile_0_1, and the two
      // fifos are linked into @memC, whose consumer is %tile_0_0.
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // Every entry of the link list is a symbol reference and needs its leading '@'.
      aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
      %core_0_2 = aie.core(%tile_0_2) {
        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inI0(Consume, 1)
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Core program for tile (1, 2) in the non-peeled matmul pipeline.
      // It uses the fifos that have %tile_1_2 as an endpoint: @inI1 and @inB1 as consumer,
      // @outC1 as producer, plus the shared @inA. Each acquire is paired with a release on
      // the same fifo.
      %core_1_2 = aie.core(%tile_1_2) {
        %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        // Operand order: buffer from @inI1, A tile, B tile, output buffer from @outC1.
        func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inI1(Consume, 1)
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Control code for the remaining iterations: %arg4 runs from 64 up to (not including)
      // 1024 in steps of 64 and is used as the column offset into A and the row offset into B.
      // In this version the core programs are reloaded on every iteration.
      // NOTE(review): the @memI and @memA transfers both carry id = 2 — confirm whether the
      // ids are meant to be distinct.
      func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          aiex.ipu.load(%core_0_2)
          aiex.ipu.load(%core_1_2)
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memI} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
          aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        }
        return
      }
      // The subviews no longer exist in this module, so the result buffer is passed as
      // %alloca directly, the same buffer handed to the fill_matmul call.
      @matmul(%1, %0, %alloca)
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}

// -----// IR Dump After AieBringInLoopsIntoAieCoreCode (Conceptually, could be multiple transformations) //----- //
// Move loops into AIE core code to avoid reloading the cores' `main` program.
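// Steps:
// - (Carried over from the previous step: subviews moved into aiex.ipu.dma_memcpy_nd, loops moved into the AIE control code function)
// - Wrap each aie.core body in the K loop
// - Hoist aiex.ipu.load out of the control-code loop, so each core program is loaded once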
// NOTES:
// - aiex.ipu.dma_memcpy_nd strides seem to be 3-dimensional in examples (stride0 is assumed to be always 1?), I use four dimensions here, similar to air.dma_memcpy_nd
// - Tiles and cores are non-ssa for simplicity here (tile_0_1, core_0_2)
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)

      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      // Output path: each compute tile produces its C block towards %tile_0_1, and the two
      // fifos are linked into @memC, whose consumer is %tile_0_0.
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // Every entry of the link list is a symbol reference and needs its leading '@'.
      aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
      %core_0_2 = aie.core(%tile_0_2) {
        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      %core_1_2 = aie.core(%tile_1_2) {
        %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      func.func @fill_matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
        aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        return
      }
      @fill_matmul(%1, %0, %alloca)

      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      // Output path: each compute tile produces its C block towards %tile_0_1, and the two
      // fifos are linked into @memC, whose consumer is %tile_0_0.
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // Every entry of the link list is a symbol reference and needs its leading '@'.
      aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
      // Core program for tile (0, 2) with the loop moved inside the core body.
      // Each iteration acquires one element from @outC0 (as producer) and one each from
      // @inI0, @inA and @inB0 (as consumer), calls the kernel, then releases all four.
      // The loop bounds (%c64 to %c1024, step %c64) are the same as the control-code loop
      // in @matmul.
      %core_0_2 = aie.core(%tile_0_2) {
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          // Operand order: buffer from @inI0, A tile, B tile, output buffer from @outC0.
          // NOTE(review): presumably the kernel accumulates onto the @inI0 buffer — the
          // callee is not shown here, confirm against mm.o.
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI0(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB0(Consume, 1)
          aie.objectfifo.release @outC0(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Core program for tile (1, 2) with the loop moved inside the core body.
      // It uses the fifos that have %tile_1_2 as an endpoint: @inI1 and @inB1 as consumer,
      // @outC1 as producer, plus the shared @inA. The output acquire/release pair targets
      // @outC1, because @outC0 is produced by %tile_0_2.
      %core_1_2 = aie.core(%tile_1_2) {
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          // Operand order: buffer from @inI1, A tile, B tile, output buffer from @outC1.
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI1(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB1(Consume, 1)
          aie.objectfifo.release @outC1(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Control code for the non-peeled iterations. Both core programs are loaded once,
      // before the loop; each iteration then issues the @memI, @memA, @memB and @memC
      // transfers and waits on the sync.
      // %arg4 runs from 64 up to (not including) 1024 in steps of 64 and is used as the
      // column offset into %arg0 and the row offset into %arg1.
      // NOTE(review): the @memI and @memA transfers both carry id = 2 — confirm whether the
      // ids are meant to be distinct.
      func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memI} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
          aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        }
        return
      }
      // Pseudo-call of the control code with the full A and B buffers and the result buffer.
      @matmul(%1, %0, %alloca)
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}


// -----// IR Dump After AieSuperimposeStateMachines (Conceptually, could be multiple transformations) //----- //
// Here, we check for overlapping objectfifo state machines, like we see above (peeled fill + matmul is a 'sub' state machine of unpeeled matmul)
// and keep the 'super' state machine
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)

      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 4, stride = 8>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256, <size = 8, stride = 8>>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512, <size = 8, stride = 4>>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      // Output path: each compute tile produces its C block towards %tile_0_1, and the two
      // fifos are linked into @memC, whose consumer is %tile_0_0.
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8, <size = 8, stride = 32>>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // Every entry of the link list is a symbol reference and needs its leading '@'.
      aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
      %core_0_2_ = aie.core(%tile_0_2) {
        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      // Peeled (fill + matmul) core program for tile (1, 2).
      // It acquires the output buffer from @outC1, zero-initialisation is delegated to @fill,
      // and the A/B tiles come from @inA and @inB1. @inB1 is the B fifo whose consumer is
      // %tile_1_2, and it is the fifo released below, so the acquire targets it as well.
      %core_1_2_ = aie.core(%tile_1_2) {
        %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %2 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %4 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%1) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%3, %5, %1) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        aie.end
      } {link_with = "mm.o"}
      %core_0_2 = aie.core(%tile_0_2) {
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI0(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB0(Consume, 1)
          aie.objectfifo.release @outC0(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      %core_1_2 = aie.core(%tile_1_2) {
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          %0 = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %2 = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %4 = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %6 = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%3, %5, %7, %1) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI1(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB1(Consume, 1)
          aie.objectfifo.release @outC1(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Superimposed control code: the peeled first iteration followed by the looped
      // remainder, in a single function.
      //   1. Load the peeled core programs (%core_0_2_ / %core_1_2_), issue the @memA, @memB
      //      and @memC transfers at offset 0, and sync.
      //   2. Load the looped core programs (%core_0_2 / %core_1_2), then for %arg4 from 64
      //      up to (not including) 1024 in steps of 64 issue the @memI, @memA, @memB and
      //      @memC transfers and sync.
      // NOTE(review): inside the loop the @memI and @memA transfers both carry id = 2 —
      // confirm whether the ids are meant to be distinct.
      func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2_)
        aiex.ipu.load(%core_1_2_)
        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
        aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        // Switch both tiles to the looped core programs before streaming the remaining slices.
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 2 : i64, metadata = @memI} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
          aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        }
        return
      }
      // Pseudo-call of the control code with the full A and B buffers and the result buffer.
      @matmul(%1, %0, %alloca)
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}


// -----// IR Dump After AieCombineCoreCode (Conceptually, could be multiple transformations) //----- //
// Here, we combine the code for each core into a single code block (which will become a single ELF), so that `main` does not have to be reloaded.
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 64)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
module {
  func.func @matmul_i32(%arg0: tensor<32x1024xi32>, %arg1: tensor<1024x64xi32>) -> tensor<32x64xi32> {
    %c512 = arith.constant 512 : index
    %c1 = arith.constant 1 : index
    %c32 = arith.constant 32 : index
    %c256 = arith.constant 256 : index
    %c4 = arith.constant 4 : index
    %c8 = arith.constant 8 : index
    %c0 = arith.constant 0 : index
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c0_i32 = arith.constant 0 : i32
    %0 = bufferization.to_memref %arg1 : memref<1024x64xi32, strided<[?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<32x1024xi32, strided<[?, ?], offset: ?>>
    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
    %alloc_2 = memref.alloc() : memref<32x64xi32, 1>
    %alloc_3 = memref.alloc() : memref<64x64xi32, 1>
    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
    %alloca = memref.alloca() {alignment = 64 : i64} : memref<32x64xi32>
    scf.forall (%arg2, %arg3) in (1, 1) {
      %3 = affine.apply #map(%arg2)
      %4 = affine.apply #map1(%arg3)

      // Tiles used by this design. %tile_0_0 is an endpoint of the @memA, @memB,
      // @memI and @memC FIFOs below, so it has to be defined before they
      // reference it.
      %tile_0_0 = aie.tile(0, 0)
      %tile_0_1 = aie.tile(0, 1)
      %tile_0_2 = aie.tile(0, 2)
      %tile_1_1 = aie.tile(1, 1)
      %tile_1_2 = aie.tile(1, 2)
      // FIFOs from tile (0, 0) into tile (0, 1).
      aie.objectfifo @memA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo @memB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
      aie.objectfifo @memI(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      // @memI is split across the two cores; each toStream list is a flat
      // sequence of <size, stride> entries.
      aie.objectfifo @inI0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo @inI1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 4, stride = 8>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x4x8xi32>>
      aie.objectfifo.link [@memI] -> [@inI0,@inI1]()
      // @memA goes to both cores through a single FIFO with two consumers.
      aie.objectfifo @inA(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 4, stride = 64>, <size = 8, stride = 256>, <size = 8, stride = 8>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<8x8x4x8xi32>>
      aie.objectfifo.link [@memA] -> [@inA]()
      // @memB is split across the two cores.
      // NOTE(review): the sizes below multiply to 4096 elements while
      // memref<4x8x8x8xi32> holds 2048 - check whether the last entry should be
      // <size = 4, stride = 8>.
      aie.objectfifo @inB0(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo @inB1(%tile_0_1 toStream [<size = 8, stride = 1>, <size = 8, stride = 64>, <size = 8, stride = 512>, <size = 8, stride = 4>], {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<4x8x8x8xi32>>
      aie.objectfifo.link [@memB] -> [@inB0,@inB1]()
      // Per-core outputs, joined into @memC on tile (0, 1).
      // NOTE(review): these are declared as memref<32x32xi32> but the cores
      // acquire them as memref<4x8x4x8xi32> (same element count) - confirm
      // which type is intended.
      aie.objectfifo @outC0(%tile_0_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @outC1(%tile_1_2 toStream [<size = 8, stride = 1>, <size = 4, stride = 256>, <size = 4, stride = 8>, <size = 8, stride = 32>], {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xi32>>
      aie.objectfifo @memC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x64xi32>>
      aie.objectfifo.link [@outC0,@outC1] -> [@memC]()
      // Program for tile (0, 2): consumes @inA / @inB0 (and @inI0 inside the
      // loop) and produces @outC0. Values carry descriptive names instead of
      // %0..%13 because %0, %1, %3 and %4 are already defined in the enclosing
      // function, and this region (which uses %c64 and %c1024 from there) may
      // not redefine them.
      %core_0_2 = aie.core(%tile_0_2) {
        // First step, peeled out of the loop: @fill is applied to the output
        // buffer before the kernel runs, and nothing is read from @inI0.
        // NOTE(review): @fill is defined outside this chunk; presumably it
        // zero-initialises the buffer - confirm.
        %out_first_sv = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %out_first = aie.objectfifo.subview.access %out_first_sv[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %a_first_sv = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %a_first = aie.objectfifo.subview.access %a_first_sv[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %b_first_sv = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %b_first = aie.objectfifo.subview.access %b_first_sv[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%out_first) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%a_first, %b_first, %out_first) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB0(Consume, 1)
        aie.objectfifo.release @outC0(Produce, 1)
        // Remaining 15 steps: each one additionally consumes a tile from @inI0
        // and passes it to the kernel as an extra first operand.
        // NOTE(review): @matmul_vectorized_i32 is called with three operands
        // above and four here; one symbol can only have one signature, so the
        // four-operand variant may need its own name - confirm.
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          %out_sv = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %out = aie.objectfifo.subview.access %out_sv[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %acc_sv = aie.objectfifo.acquire @inI0(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %acc = aie.objectfifo.subview.access %acc_sv[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %a_sv = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %a = aie.objectfifo.subview.access %a_sv[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %b_sv = aie.objectfifo.acquire @inB0(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %b = aie.objectfifo.subview.access %b_sv[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%acc, %a, %b, %out) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI0(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB0(Consume, 1)
          aie.objectfifo.release @outC0(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Program for tile (1, 2): consumes @inA / @inB1 (and @inI1 inside the
      // loop) and produces @outC1. Values carry descriptive names instead of
      // %0..%13 because %0, %1, %3 and %4 are already defined in the enclosing
      // function, and this region (which uses %c64 and %c1024 from there) may
      // not redefine them.
      %core_1_2 = aie.core(%tile_1_2) {
        // First step, peeled out of the loop: @fill is applied to the output
        // buffer before the kernel runs, and nothing is read from @inI1.
        // NOTE(review): @fill is defined outside this chunk; presumably it
        // zero-initialises the buffer - confirm.
        %out_first_sv = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
        %out_first = aie.objectfifo.subview.access %out_first_sv[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
        %a_first_sv = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
        %a_first = aie.objectfifo.subview.access %a_first_sv[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
        %b_first_sv = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
        %b_first = aie.objectfifo.subview.access %b_first_sv[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
        func.call @fill(%out_first) : (memref<4x8x4x8xi32>) -> ()
        func.call @matmul_vectorized_i32(%a_first, %b_first, %out_first) : (memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
        aie.objectfifo.release @inA(Consume, 1)
        aie.objectfifo.release @inB1(Consume, 1)
        aie.objectfifo.release @outC1(Produce, 1)
        // Remaining 15 steps: each one additionally consumes a tile from @inI1
        // and passes it to the kernel as an extra first operand.
        // NOTE(review): @matmul_vectorized_i32 is called with three operands
        // above and four here; one symbol can only have one signature, so the
        // four-operand variant may need its own name - confirm.
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          // The output slot comes from @outC1, the FIFO this tile produces and
          // the one released at the end of the loop body (@outC0 is produced
          // by tile (0, 2)).
          %out_sv = aie.objectfifo.acquire @outC1(Produce, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %out = aie.objectfifo.subview.access %out_sv[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %acc_sv = aie.objectfifo.acquire @inI1(Consume, 1) : !aie.objectfifosubview<memref<4x8x4x8xi32>>
          %acc = aie.objectfifo.subview.access %acc_sv[0] : !aie.objectfifosubview<memref<4x8x4x8xi32>> -> memref<4x8x4x8xi32>
          %a_sv = aie.objectfifo.acquire @inA(Consume, 1) : !aie.objectfifosubview<memref<8x8x4x8xi32>>
          %a = aie.objectfifo.subview.access %a_sv[0] : !aie.objectfifosubview<memref<8x8x4x8xi32>> -> memref<8x8x4x8xi32>
          %b_sv = aie.objectfifo.acquire @inB1(Consume, 1) : !aie.objectfifosubview<memref<4x8x8x8xi32>>
          %b = aie.objectfifo.subview.access %b_sv[0] : !aie.objectfifosubview<memref<4x8x8x8xi32>> -> memref<4x8x8x8xi32>
          func.call @matmul_vectorized_i32(%acc, %a, %b, %out) : (memref<4x8x4x8xi32>, memref<8x8x4x8xi32>, memref<4x8x8x8xi32>, memref<4x8x4x8xi32>) -> ()
          aie.objectfifo.release @inI1(Consume, 1)
          aie.objectfifo.release @inA(Consume, 1)
          aie.objectfifo.release @inB1(Consume, 1)
          aie.objectfifo.release @outC1(Produce, 1)
        }
        aie.end
      } {link_with = "mm.o"}
      // Host-side sequence for the peeled matmul after the core code has been
      // combined: each core program is loaded once, the transfers for the first
      // 64-wide slice of the 1024-long shared dimension are issued, and the loop
      // then covers the remaining slices (offsets 64, 128, ..., 960).
      func.func @matmul(%arg0: memref<32x1024xi32>, %arg1: memref<1024x64xi32>, %arg2: memref<32x64xi32>) {
        aiex.ipu.load(%core_0_2)
        aiex.ipu.load(%core_1_2)
        // First slice: 32x64 window of %arg0 into @memA, 4096 contiguous
        // elements of %arg1 into @memB, and the 2048-element %arg2 buffer
        // through @memC.
        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
        aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
        aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        scf.for %arg4 = %c64 to %c1024 step %c64 {
          // %arg2 is sent through @memI and received again through @memC in
          // every step. The @memI transfer uses id 3 so that it no longer
          // shares id 2 with the @memA transfer issued right after it.
          // NOTE(review): assumes ids must be unique among the transfers issued
          // before one sync - confirm against the AIEX dialect documentation.
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 3 : i64, metadata = @memI} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, %arg4][1, 1, 32, 64][0, 0, 1024, 1]) {id = 2 : i64, metadata = @memA} : memref<32x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg1[0, 0, %arg4, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @memB} : memref<64x64xi32>
          aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0]) {id = 0 : i64, metadata = @memC} : memref<32x64xi32>
          aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
        }
        return
      }
      @matmul(%1, %0, %alloca)
    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
    %2 = bufferization.to_tensor %alloca : memref<32x64xi32>
    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
    memref.dealloc %alloc_3 : memref<64x64xi32, 1>
    memref.dealloc %alloc_2 : memref<32x64xi32, 1>
    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
    return %2 : tensor<32x64xi32>
  }
}