Skip to content

Instantly share code, notes, and snippets.

@jtuyls
Last active July 2, 2024 12:37
Show Gist options
  • Save jtuyls/0d46284d9d3c5780bd298ad7de4d88a3 to your computer and use it in GitHub Desktop.
Save jtuyls/0d46284d9d3c5780bd298ad7de4d88a3 to your computer and use it in GitHub Desktop.
Vectorized bf16 matmul: input IR for the amdaie-objectFifo-stateful-transform pass
${IREE_BUILD_DIR}/tools/iree-opt %s --mlir-print-ir-before-all --amdaie-objectFifo-stateful-transform
aie.device(npu1_4col) {
// Tiles used by the object FIFOs and cores in this device.
// NOTE(review): coordinates are presumably (column, row) — confirm against the AIE dialect docs.
%tile_0_2 = aie.tile(0, 2)
%tile_0_3 = aie.tile(0, 3)
%tile_1_2 = aie.tile(1, 2)
%tile_1_3 = aie.tile(1, 3)
%tile_0_0 = aie.tile(0, 0)
%tile_0_1 = aie.tile(0, 1)
// Two input FIFOs from tile (0, 0) to tile (0, 1), each carrying 2048 x bf16 elements.
// NOTE(review): the trailing `2 : i32` is presumably the FIFO depth — confirm.
aie.objectfifo @obj0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16, 1>>
aie.objectfifo @obj1(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16, 1>>
// @obj2 / @obj3: 1024 x bf16 FIFOs leaving tile (0, 1) with a three-level <size, stride>
// toStream pattern (8x4, 32x32, 4x1). Each one lists two consumer tiles.
aie.objectfifo @obj2(%tile_0_1 toStream [<size = 8, stride = 4>, <size = 32, stride = 32>, <size = 4, stride = 1>], {%tile_0_2, %tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1>>
aie.objectfifo @obj3(%tile_0_1 toStream [<size = 8, stride = 4>, <size = 32, stride = 32>, <size = 4, stride = 1>], {%tile_1_2, %tile_1_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1>>
// @obj1 (2048 elements) is linked to @obj2 and @obj3 (1024 elements each).
aie.objectfifo.link [@obj1] -> [@obj2, @obj3]()
// @obj4 / @obj5: 1024 x bf16 FIFOs leaving tile (0, 1) with a different toStream pattern
// (4x8, 32x32, 8x1). Each one lists two consumer tiles.
aie.objectfifo @obj4(%tile_0_1 toStream [<size = 4, stride = 8>, <size = 32, stride = 32>, <size = 8, stride = 1>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1>>
aie.objectfifo @obj5(%tile_0_1 toStream [<size = 4, stride = 8>, <size = 32, stride = 32>, <size = 8, stride = 1>], {%tile_0_3, %tile_1_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1>>
// @obj0 (2048 elements) is linked to @obj4 and @obj5 (1024 elements each).
aie.objectfifo.link [@obj0] -> [@obj4, @obj5]()
// @obj6..@obj9: one 1024 x f32 result FIFO per compute tile, all consumed by tile (0, 1).
// Each has the same toStream pattern (32x4, 8x128, 4x1) and `4 : i32` as the last operand.
aie.objectfifo @obj6(%tile_0_2 toStream [<size = 32, stride = 4>, <size = 8, stride = 128>, <size = 4, stride = 1>], {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<1024xf32, 1>>
aie.objectfifo @obj7(%tile_1_2 toStream [<size = 32, stride = 4>, <size = 8, stride = 128>, <size = 4, stride = 1>], {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<1024xf32, 1>>
aie.objectfifo @obj8(%tile_0_3 toStream [<size = 32, stride = 4>, <size = 8, stride = 128>, <size = 4, stride = 1>], {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<1024xf32, 1>>
aie.objectfifo @obj9(%tile_1_3 toStream [<size = 32, stride = 4>, <size = 8, stride = 128>, <size = 4, stride = 1>], {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<1024xf32, 1>>
// @obj10: 4096 x f32 FIFO from tile (0, 1) back to tile (0, 0), with a four-level toStream pattern.
aie.objectfifo @obj10(%tile_0_1 toStream [<size = 2, stride = 2048>, <size = 32, stride = 32>, <size = 2, stride = 1024>, <size = 32, stride = 1>], {%tile_0_0}, 4 : i32) : !aie.objectfifo<memref<4096xf32, 1>>
// The four 1024-element result FIFOs are linked into the single 4096-element @obj10.
aie.objectfifo.link [@obj6, @obj7, @obj8, @obj9] -> [@obj10]()
// Core program for %tile_0_2.
// Runs 4 outer iterations. Each iteration acquires one @obj6 element to produce into,
// zero-fills it, then accumulates 1 + 6 + 1 = 8 operand pairs taken from @obj4 (first
// contraction operand) and @obj2 (second contraction operand) before releasing all three FIFOs.
%core_0_2 = aie.core(%tile_0_2) {
  // Index constants used as loop bounds, steps and fixed subscripts.
  %idx0 = arith.constant 0 : index
  %idx1 = arith.constant 1 : index
  %idx4 = arith.constant 4 : index
  %idx6 = arith.constant 6 : index
  %idx8 = arith.constant 8 : index
  // Padding value for the bf16 reads; zero / padding value for the f32 accumulator.
  %pad_bf16 = arith.constant 0.000000e+00 : bf16
  %zero_f32 = arith.constant 0.000000e+00 : f32
  scf.for %tile_iter = %idx0 to %idx4 step %idx1 {
    // Output element: flat 1024 x f32 buffer viewed as 1x1x8x8x4x4.
    %out_sub = aie.objectfifo.acquire @obj6(Produce, 1) : !aie.objectfifosubview<memref<1024xf32, 1>>
    %out_buf = aie.objectfifo.subview.access %out_sub[0] : !aie.objectfifosubview<memref<1024xf32, 1>> -> memref<1024xf32, 1>
    %acc = memref.reinterpret_cast %out_buf to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 1> to memref<1x1x8x8x4x4xf32, 1>
    // First @obj2 element, viewed as 1x1x8x4x8x4.
    %rhs_sub_first = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %rhs_buf_first = aie.objectfifo.subview.access %rhs_sub_first[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %rhs_first = memref.reinterpret_cast %rhs_buf_first to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    // First @obj4 element, viewed as 1x1x4x8x4x8.
    %lhs_sub_first = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %lhs_buf_first = aie.objectfifo.subview.access %lhs_sub_first[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %lhs_first = memref.reinterpret_cast %lhs_buf_first to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>

    // Zero-fill the whole accumulator, one scalar store per element.
    scf.for %z0 = %idx0 to %idx8 step %idx1 {
      scf.for %z1 = %idx0 to %idx8 step %idx1 {
        scf.for %z2 = %idx0 to %idx4 step %idx1 {
          scf.for %z3 = %idx0 to %idx4 step %idx1 {
            memref.store %zero_f32, %acc[%idx0, %idx0, %z0, %z1, %z2, %z3] : memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Accumulate the first operand pair: for each (%i, %j) accumulator sub-tile, sweep %k,
    // which subscripts both operands but not the accumulator.
    scf.for %i = %idx0 to %idx8 step %idx1 {
      scf.for %j = %idx0 to %idx8 step %idx1 {
        scf.for %k = %idx0 to %idx4 step %idx1 {
          %lhs_bf16 = vector.transfer_read %lhs_first[%idx0, %idx0, %k, %i, %idx0, %idx0], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %rhs_bf16 = vector.transfer_read %rhs_first[%idx0, %idx0, %j, %k, %idx0, %idx0], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %acc_in = vector.transfer_read %acc[%idx0, %idx0, %j, %i, %idx0, %idx0], %zero_f32 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          // Widen both bf16 operands to f32 before contracting.
          %lhs_f32 = arith.extf %lhs_bf16 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %rhs_f32 = arith.extf %rhs_bf16 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %acc_out = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_f32, %rhs_f32, %acc_in : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %acc_out, %acc[%idx0, %idx0, %j, %i, %idx0, %idx0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Six further operand pairs: release the previous element of each input FIFO,
    // acquire the next one, and accumulate into the same output element.
    scf.for %k_tile = %idx0 to %idx6 step %idx1 {
      aie.objectfifo.release @obj2(Consume, 1)
      %rhs_sub_mid = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %rhs_buf_mid = aie.objectfifo.subview.access %rhs_sub_mid[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %rhs_mid = memref.reinterpret_cast %rhs_buf_mid to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
      aie.objectfifo.release @obj4(Consume, 1)
      %lhs_sub_mid = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %lhs_buf_mid = aie.objectfifo.subview.access %lhs_sub_mid[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %lhs_mid = memref.reinterpret_cast %lhs_buf_mid to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
      scf.for %i = %idx0 to %idx8 step %idx1 {
        scf.for %j = %idx0 to %idx8 step %idx1 {
          scf.for %k = %idx0 to %idx4 step %idx1 {
            %lhs_bf16 = vector.transfer_read %lhs_mid[%idx0, %idx0, %k, %i, %idx0, %idx0], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
            %rhs_bf16 = vector.transfer_read %rhs_mid[%idx0, %idx0, %j, %k, %idx0, %idx0], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
            %acc_in = vector.transfer_read %acc[%idx0, %idx0, %j, %i, %idx0, %idx0], %zero_f32 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
            %lhs_f32 = arith.extf %lhs_bf16 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
            %rhs_f32 = arith.extf %rhs_bf16 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
            %acc_out = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_f32, %rhs_f32, %acc_in : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
            vector.transfer_write %acc_out, %acc[%idx0, %idx0, %j, %i, %idx0, %idx0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Last operand pair for this output element.
    aie.objectfifo.release @obj2(Consume, 1)
    %rhs_sub_last = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %rhs_buf_last = aie.objectfifo.subview.access %rhs_sub_last[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %rhs_last = memref.reinterpret_cast %rhs_buf_last to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    aie.objectfifo.release @obj4(Consume, 1)
    %lhs_sub_last = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %lhs_buf_last = aie.objectfifo.subview.access %lhs_sub_last[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %lhs_last = memref.reinterpret_cast %lhs_buf_last to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
    scf.for %i = %idx0 to %idx8 step %idx1 {
      scf.for %j = %idx0 to %idx8 step %idx1 {
        scf.for %k = %idx0 to %idx4 step %idx1 {
          %lhs_bf16 = vector.transfer_read %lhs_last[%idx0, %idx0, %k, %i, %idx0, %idx0], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %rhs_bf16 = vector.transfer_read %rhs_last[%idx0, %idx0, %j, %k, %idx0, %idx0], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %acc_in = vector.transfer_read %acc[%idx0, %idx0, %j, %i, %idx0, %idx0], %zero_f32 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %lhs_f32 = arith.extf %lhs_bf16 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %rhs_f32 = arith.extf %rhs_bf16 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %acc_out = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_f32, %rhs_f32, %acc_in : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %acc_out, %acc[%idx0, %idx0, %j, %i, %idx0, %idx0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Hand back the last input elements and publish the finished output element.
    aie.objectfifo.release @obj4(Consume, 1)
    aie.objectfifo.release @obj2(Consume, 1)
    aie.objectfifo.release @obj6(Produce, 1)
  }
  aie.end
}
// Core program for %tile_1_2.
// Runs 4 outer iterations. Each iteration acquires one @obj7 element to produce into,
// zero-fills it, then accumulates 1 + 6 + 1 = 8 operand pairs taken from @obj4 (first
// contraction operand) and @obj3 (second contraction operand) before releasing all three FIFOs.
//
// Fix: the accumulator view %reinterpret_cast is memref<1x1x8x8x4x4xf32> over a 1024-element
// buffer, so its two leading dimensions only admit index 0. The accumulator transfer ops
// previously used %c1 for dim 1 with in_bounds = false on that dim. Under vector-dialect
// semantics an out-of-bounds transfer_read yields the padding value and an out-of-bounds
// transfer_write is not performed, so no result ever reached the buffer that the zero-fill
// addresses at [0, 0, ...]. They now use %c0 with in_bounds = true on that dim.
%core_1_2 = aie.core(%tile_1_2) {
  %c6 = arith.constant 6 : index
  %c8 = arith.constant 8 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  // Padding value for the bf16 reads.
  %cst = arith.constant 0.000000e+00 : bf16
  %c0 = arith.constant 0 : index
  // Zero-fill value and padding value for the f32 accumulator.
  %cst_0 = arith.constant 0.000000e+00 : f32
  scf.for %arg0 = %c0 to %c4 step %c1 {
    // Output element: flat 1024 x f32 buffer viewed as 1x1x8x8x4x4.
    %0 = aie.objectfifo.acquire @obj7(Produce, 1) : !aie.objectfifosubview<memref<1024xf32, 1>>
    %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xf32, 1>> -> memref<1024xf32, 1>
    %reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 1> to memref<1x1x8x8x4x4xf32, 1>
    // First @obj3 element, viewed as 1x1x8x4x8x4.
    %2 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_1 = memref.reinterpret_cast %3 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    // First @obj4 element, viewed as 1x1x4x8x4x8.
    %4 = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_2 = memref.reinterpret_cast %5 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>

    // Zero-fill the whole accumulator, one scalar store per element.
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          scf.for %arg4 = %c0 to %c4 step %c1 {
            memref.store %cst_0, %reinterpret_cast[%c0, %c0, %arg1, %arg2, %arg3, %arg4] : memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Accumulate the first operand pair. %arg3 subscripts both operands but not the accumulator.
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          %10 = vector.transfer_read %reinterpret_cast_2[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %11 = vector.transfer_read %reinterpret_cast_1[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          // Accumulator sub-tile: leading indices are 0, matching the zero-fill above.
          %12 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          // Widen both bf16 operands to f32 before contracting.
          %13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %15, %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Six further operand pairs: release the previous element of each input FIFO,
    // acquire the next one, and accumulate into the same output element.
    scf.for %arg1 = %c0 to %c6 step %c1 {
      aie.objectfifo.release @obj3(Consume, 1)
      %10 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %reinterpret_cast_5 = memref.reinterpret_cast %11 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
      aie.objectfifo.release @obj4(Consume, 1)
      %12 = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %reinterpret_cast_6 = memref.reinterpret_cast %13 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c8 step %c1 {
          scf.for %arg4 = %c0 to %c4 step %c1 {
            %14 = vector.transfer_read %reinterpret_cast_6[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
            %15 = vector.transfer_read %reinterpret_cast_5[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
            %16 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
            %17 = arith.extf %14 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
            %18 = arith.extf %15 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
            %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
            vector.transfer_write %19, %reinterpret_cast[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Last operand pair for this output element.
    aie.objectfifo.release @obj3(Consume, 1)
    %6 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_3 = memref.reinterpret_cast %7 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    aie.objectfifo.release @obj4(Consume, 1)
    %8 = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_4 = memref.reinterpret_cast %9 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          %10 = vector.transfer_read %reinterpret_cast_4[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %11 = vector.transfer_read %reinterpret_cast_3[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %12 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %15, %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Hand back the last input elements and publish the finished output element.
    aie.objectfifo.release @obj3(Consume, 1)
    aie.objectfifo.release @obj4(Consume, 1)
    aie.objectfifo.release @obj7(Produce, 1)
  }
  aie.end
}
// Core program for %tile_0_3.
// Runs 4 outer iterations. Each iteration acquires one @obj8 element to produce into,
// zero-fills it, then accumulates 1 + 6 + 1 = 8 operand pairs taken from @obj5 (first
// contraction operand) and @obj2 (second contraction operand) before releasing all three FIFOs.
//
// Fix: the accumulator view %reinterpret_cast is memref<1x1x8x8x4x4xf32> over a 1024-element
// buffer, so its two leading dimensions only admit index 0. The accumulator transfer ops
// previously used %c1 for dim 0 with in_bounds = false on that dim. Under vector-dialect
// semantics an out-of-bounds transfer_read yields the padding value and an out-of-bounds
// transfer_write is not performed, so no result ever reached the buffer that the zero-fill
// addresses at [0, 0, ...]. They now use %c0 with in_bounds = true on that dim.
%core_0_3 = aie.core(%tile_0_3) {
  %c6 = arith.constant 6 : index
  %c8 = arith.constant 8 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  // Padding value for the bf16 reads.
  %cst = arith.constant 0.000000e+00 : bf16
  %c0 = arith.constant 0 : index
  // Zero-fill value and padding value for the f32 accumulator.
  %cst_0 = arith.constant 0.000000e+00 : f32
  scf.for %arg0 = %c0 to %c4 step %c1 {
    // Output element: flat 1024 x f32 buffer viewed as 1x1x8x8x4x4.
    %0 = aie.objectfifo.acquire @obj8(Produce, 1) : !aie.objectfifosubview<memref<1024xf32, 1>>
    %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xf32, 1>> -> memref<1024xf32, 1>
    %reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 1> to memref<1x1x8x8x4x4xf32, 1>
    // First @obj2 element, viewed as 1x1x8x4x8x4.
    %2 = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_1 = memref.reinterpret_cast %3 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    // First @obj5 element, viewed as 1x1x4x8x4x8.
    %4 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_2 = memref.reinterpret_cast %5 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>

    // Zero-fill the whole accumulator, one scalar store per element.
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          scf.for %arg4 = %c0 to %c4 step %c1 {
            memref.store %cst_0, %reinterpret_cast[%c0, %c0, %arg1, %arg2, %arg3, %arg4] : memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Accumulate the first operand pair. %arg3 subscripts both operands but not the accumulator.
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          %10 = vector.transfer_read %reinterpret_cast_2[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %11 = vector.transfer_read %reinterpret_cast_1[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          // Accumulator sub-tile: leading indices are 0, matching the zero-fill above.
          %12 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          // Widen both bf16 operands to f32 before contracting.
          %13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %15, %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Six further operand pairs: release the previous element of each input FIFO,
    // acquire the next one, and accumulate into the same output element.
    scf.for %arg1 = %c0 to %c6 step %c1 {
      aie.objectfifo.release @obj2(Consume, 1)
      %10 = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %reinterpret_cast_5 = memref.reinterpret_cast %11 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
      aie.objectfifo.release @obj5(Consume, 1)
      %12 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %reinterpret_cast_6 = memref.reinterpret_cast %13 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c8 step %c1 {
          scf.for %arg4 = %c0 to %c4 step %c1 {
            %14 = vector.transfer_read %reinterpret_cast_6[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
            %15 = vector.transfer_read %reinterpret_cast_5[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
            %16 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
            %17 = arith.extf %14 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
            %18 = arith.extf %15 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
            %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
            vector.transfer_write %19, %reinterpret_cast[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Last operand pair for this output element.
    aie.objectfifo.release @obj2(Consume, 1)
    %6 = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_3 = memref.reinterpret_cast %7 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    aie.objectfifo.release @obj5(Consume, 1)
    %8 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_4 = memref.reinterpret_cast %9 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          %10 = vector.transfer_read %reinterpret_cast_4[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %11 = vector.transfer_read %reinterpret_cast_3[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %12 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %15, %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Hand back the last input elements and publish the finished output element.
    aie.objectfifo.release @obj5(Consume, 1)
    aie.objectfifo.release @obj2(Consume, 1)
    aie.objectfifo.release @obj8(Produce, 1)
  }
  aie.end
}
%core_1_3 = aie.core(%tile_1_3) {
%c6 = arith.constant 6 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : bf16
%c0 = arith.constant 0 : index
%cst_0 = arith.constant 0.000000e+00 : f32
scf.for %arg0 = %c0 to %c4 step %c1 {
%0 = aie.objectfifo.acquire @obj9(Produce, 1) : !aie.objectfifosubview<memref<1024xf32, 1>>
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xf32, 1>> -> memref<1024xf32, 1>
%reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 1> to memref<1x1x8x8x4x4xf32, 1>
%2 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
%reinterpret_cast_1 = memref.reinterpret_cast %3 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
%4 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
%reinterpret_cast_2 = memref.reinterpret_cast %5 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
scf.for %arg1 = %c0 to %c8 step %c1 {
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c4 step %c1 {
scf.for %arg4 = %c0 to %c4 step %c1 {
memref.store %cst_0, %reinterpret_cast[%c0, %c0, %arg1, %arg2, %arg3, %arg4] : memref<1x1x8x8x4x4xf32, 1>
}
}
}
}
scf.for %arg1 = %c0 to %c8 step %c1 {
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c4 step %c1 {
%10 = vector.transfer_read %reinterpret_cast_2[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
%11 = vector.transfer_read %reinterpret_cast_1[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
%12 = vector.transfer_read %reinterpret_cast[%c1, %c1, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [false, false, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
%13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
%14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
vector.transfer_write %15, %reinterpret_cast[%c1, %c1, %arg2, %arg1, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
}
}
}
scf.for %arg1 = %c0 to %c6 step %c1 {
aie.objectfifo.release @obj3(Consume, 1)
%10 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
%11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
%reinterpret_cast_5 = memref.reinterpret_cast %11 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
aie.objectfifo.release @obj5(Consume, 1)
%12 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
%13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
%reinterpret_cast_6 = memref.reinterpret_cast %13 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c8 step %c1 {
scf.for %arg4 = %c0 to %c4 step %c1 {
%14 = vector.transfer_read %reinterpret_cast_6[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
%15 = vector.transfer_read %reinterpret_cast_5[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
%16 = vector.transfer_read %reinterpret_cast[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst_0 {in_bounds = [false, false, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
%17 = arith.extf %14 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
%18 = arith.extf %15 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
%19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
vector.transfer_write %19, %reinterpret_cast[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
}
}
}
}
aie.objectfifo.release @obj3(Consume, 1)
%6 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
%7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
%reinterpret_cast_3 = memref.reinterpret_cast %7 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
aie.objectfifo.release @obj5(Consume, 1)
%8 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
%9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
%reinterpret_cast_4 = memref.reinterpret_cast %9 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
scf.for %arg1 = %c0 to %c8 step %c1 {
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c4 step %c1 {
%10 = vector.transfer_read %reinterpret_cast_4[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
%11 = vector.transfer_read %reinterpret_cast_3[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
%12 = vector.transfer_read %reinterpret_cast[%c1, %c1, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [false, false, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
%13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
%14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
vector.transfer_write %15, %reinterpret_cast[%c1, %c1, %arg2, %arg1, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
}
}
}
aie.objectfifo.release @obj5(Consume, 1)
aie.objectfifo.release @obj3(Consume, 1)
aie.objectfifo.release @obj9(Produce, 1)
}
aie.end
}
// Host-side DMA sequence for the matmul dispatch.
//
// Arguments: three buffers, each typed memref<16384xi32>. %arg0 is only ever
// moved through @obj0, %arg1 through @obj1 and %arg2 through @obj10.
//
// Structure: every aiex.npu.dma_memcpy_nd is immediately followed by an
// aiex.npu.dma_wait on the same symbol, so the sequence is fully serialized.
// The body consists of four groups; each group issues eight alternating
// (%arg0 via @obj0, %arg1 via @obj1) transfers and then one %arg2 transfer
// via @obj10.
//
// The three bracket lists after each buffer are the op's offsets / sizes /
// strides operands (per the AIEX op syntax — confirm against the dialect docs).
// All %arg0/%arg1 transfers use sizes [0, 2, 32, 16]; %arg0 uses strides
// [1, 4096, 128] and %arg1 uses strides [1, 16, 64]. All %arg2 transfers use
// sizes [0, 0, 64, 32] and strides [1, 1, 64]. Only the innermost offset
// changes from one transfer to the next.
//
// NOTE(review): the function name contains both "i8_i32" and
// "bf16xbf16xf32"; presumably the prefix is a leftover from the test name —
// confirm.
// NOTE(review): the %arg0 offsets in the last two groups (16384..16608) and
// the %arg1 offsets from 16384 upward are >= 16384, the element count of the
// declared memref<16384xi32> types. Presumably the offsets are counted in
// bf16 elements while the memrefs are typed as i32 — confirm.
// NOTE(review): every dma_memcpy_nd uses id = 0, and the leading size entries
// are 0; whether that is what the lowering expects cannot be told from here —
// verify.
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16xbf16xf32(%arg0: memref<16384xi32>, %arg1: memref<16384xi32>, %arg2: memref<16384xi32>) {
// Group 1: %arg0 offsets 0..224 (step 32), %arg1 offsets 0..28672
// (step 4096), then %arg2 at offset 0.
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 32][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 4096][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 64][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 8192][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 96][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 12288][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 128][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 16384][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 160][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 20480][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 192][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 24576][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 224][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 28672][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][0, 0, 64, 32][1, 1, 64]) {id = 0 : i64, metadata = @obj10} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj10}
// Group 2: same %arg0 offsets as group 1 (0..224); %arg1 offsets are the
// group-1 values plus 64 (64..28736); then %arg2 at offset 64.
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 64][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 32][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 4160][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 64][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 8256][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 96][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 12352][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 128][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 16448][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 160][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 20544][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 192][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 24640][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 224][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 28736][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 64][0, 0, 64, 32][1, 1, 64]) {id = 0 : i64, metadata = @obj10} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj10}
// Group 3: %arg0 offsets 16384..16608 (step 32); %arg1 offsets as in
// group 1 (0..28672); then %arg2 at offset 8192.
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16384][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16416][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 4096][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16448][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 8192][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16480][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 12288][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16512][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 16384][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16544][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 20480][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16576][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 24576][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16608][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 28672][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 8192][0, 0, 64, 32][1, 1, 64]) {id = 0 : i64, metadata = @obj10} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj10}
// Group 4: %arg0 offsets as in group 3 (16384..16608); %arg1 offsets as in
// group 2 (64..28736); then %arg2 at offset 8256 (= 8192 + 64).
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16384][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 64][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16416][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 4160][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16448][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 8256][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16480][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 12352][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16512][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 16448][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16544][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 20544][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16576][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 24640][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16608][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj0}
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 28736][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj1}
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 8256][0, 0, 64, 32][1, 1, 64]) {id = 0 : i64, metadata = @obj10} : memref<16384xi32>
aiex.npu.dma_wait {symbol = @obj10}
return
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment