-
-
Save jtuyls/0d46284d9d3c5780bd298ad7de4d88a3 to your computer and use it in GitHub Desktop.
Matmul bf16 vec with amdaie-objectFifo-stateful-transform
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Run with: ${IREE_BUILD_DIR}/tools/iree-opt %s --mlir-print-ir-before-all --amdaie-objectFifo-stateful-transform
aie.device(npu1_4col) { | |
// Tiles referenced by the object FIFOs and cores below.
// NOTE(review): the two integers are presumably (column, row) -- confirm against the AIE dialect docs.
%tile_0_2 = aie.tile(0, 2) | |
%tile_0_3 = aie.tile(0, 3) | |
%tile_1_2 = aie.tile(1, 2) | |
%tile_1_3 = aie.tile(1, 3) | |
%tile_0_0 = aie.tile(0, 0) | |
%tile_0_1 = aie.tile(0, 1) | |
// Two input FIFOs from %tile_0_0 into %tile_0_1, each carrying 2048 bf16 elements with depth 2.
aie.objectfifo @obj0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16, 1>> | |
aie.objectfifo @obj1(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16, 1>> | |
// @obj1 is linked to @obj2 and @obj3 (1024 bf16 elements each).
// @obj2 is consumed by %tile_0_2 and %tile_0_3; @obj3 by %tile_1_2 and %tile_1_3.
// The `toStream` list is a (size, stride) access pattern on the %tile_0_1 side.
aie.objectfifo @obj2(%tile_0_1 toStream [<size = 8, stride = 4>, <size = 32, stride = 32>, <size = 4, stride = 1>], {%tile_0_2, %tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1>> | |
aie.objectfifo @obj3(%tile_0_1 toStream [<size = 8, stride = 4>, <size = 32, stride = 32>, <size = 4, stride = 1>], {%tile_1_2, %tile_1_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1>> | |
aie.objectfifo.link [@obj1] -> [@obj2, @obj3]() | |
// @obj0 is linked to @obj4 and @obj5 (1024 bf16 elements each).
// @obj4 is consumed by %tile_0_2 and %tile_1_2; @obj5 by %tile_0_3 and %tile_1_3.
aie.objectfifo @obj4(%tile_0_1 toStream [<size = 4, stride = 8>, <size = 32, stride = 32>, <size = 8, stride = 1>], {%tile_0_2, %tile_1_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1>> | |
aie.objectfifo @obj5(%tile_0_1 toStream [<size = 4, stride = 8>, <size = 32, stride = 32>, <size = 8, stride = 1>], {%tile_0_3, %tile_1_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16, 1>> | |
aie.objectfifo.link [@obj0] -> [@obj4, @obj5]() | |
// One result FIFO per core tile (1024 f32 elements, depth 4), all consumed by %tile_0_1.
aie.objectfifo @obj6(%tile_0_2 toStream [<size = 32, stride = 4>, <size = 8, stride = 128>, <size = 4, stride = 1>], {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<1024xf32, 1>> | |
aie.objectfifo @obj7(%tile_1_2 toStream [<size = 32, stride = 4>, <size = 8, stride = 128>, <size = 4, stride = 1>], {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<1024xf32, 1>> | |
aie.objectfifo @obj8(%tile_0_3 toStream [<size = 32, stride = 4>, <size = 8, stride = 128>, <size = 4, stride = 1>], {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<1024xf32, 1>> | |
aie.objectfifo @obj9(%tile_1_3 toStream [<size = 32, stride = 4>, <size = 8, stride = 128>, <size = 4, stride = 1>], {%tile_0_1}, 4 : i32) : !aie.objectfifo<memref<1024xf32, 1>> | |
// The four 1024-element result FIFOs are linked into @obj10 (4096 f32 elements), which goes from %tile_0_1 to %tile_0_0.
aie.objectfifo @obj10(%tile_0_1 toStream [<size = 2, stride = 2048>, <size = 32, stride = 32>, <size = 2, stride = 1024>, <size = 32, stride = 1>], {%tile_0_0}, 4 : i32) : !aie.objectfifo<memref<4096xf32, 1>> | |
aie.objectfifo.link [@obj6, @obj7, @obj8, @obj9] -> [@obj10]() | |
// Core program for %tile_0_2.
// For each of 4 rounds it acquires one @obj6 output element, zero-fills it,
// accumulates 8 input pairs from @obj2 / @obj4 into it (1 initial pair,
// 6 pairs in the middle loop, 1 final pair), and then releases it.
%core_0_2 = aie.core(%tile_0_2) {
  %zero = arith.constant 0 : index
  %one = arith.constant 1 : index
  %four = arith.constant 4 : index
  %six = arith.constant 6 : index
  %eight = arith.constant 8 : index
  // Padding value for the bf16 reads and zero / padding value for the f32 accumulator.
  %pad_bf16 = arith.constant 0.000000e+00 : bf16
  %zero_f32 = arith.constant 0.000000e+00 : f32
  scf.for %round = %zero to %four step %one {
    // Output element, viewed as a 1x1x8x8x4x4 f32 tile.
    %out_view = aie.objectfifo.acquire @obj6(Produce, 1) : !aie.objectfifosubview<memref<1024xf32, 1>>
    %out_buf = aie.objectfifo.subview.access %out_view[0] : !aie.objectfifosubview<memref<1024xf32, 1>> -> memref<1024xf32, 1>
    %out_tile = memref.reinterpret_cast %out_buf to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 1> to memref<1x1x8x8x4x4xf32, 1>
    // First @obj2 element, viewed as 1x1x8x4x8x4 bf16.
    %in2_view_first = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %in2_buf_first = aie.objectfifo.subview.access %in2_view_first[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %in2_tile_first = memref.reinterpret_cast %in2_buf_first to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    // First @obj4 element, viewed as 1x1x4x8x4x8 bf16.
    %in4_view_first = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %in4_buf_first = aie.objectfifo.subview.access %in4_view_first[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %in4_tile_first = memref.reinterpret_cast %in4_buf_first to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>

    // Zero-fill every element of the output tile.
    scf.for %i2 = %zero to %eight step %one {
      scf.for %i3 = %zero to %eight step %one {
        scf.for %i4 = %zero to %four step %one {
          scf.for %i5 = %zero to %four step %one {
            memref.store %zero_f32, %out_tile[%zero, %zero, %i2, %i3, %i4, %i5] : memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Accumulate the first input pair into the output tile, one 4x4 block at a time.
    scf.for %u = %zero to %eight step %one {
      scf.for %v = %zero to %eight step %one {
        scf.for %w = %zero to %four step %one {
          %a_vec = vector.transfer_read %in4_tile_first[%zero, %zero, %w, %u, %zero, %zero], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %b_vec = vector.transfer_read %in2_tile_first[%zero, %zero, %v, %w, %zero, %zero], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %acc_in = vector.transfer_read %out_tile[%zero, %zero, %v, %u, %zero, %zero], %zero_f32 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %a_f32 = arith.extf %a_vec : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %b_f32 = arith.extf %b_vec : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %acc_out = vector.contract {
              indexing_maps = [
                affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>,
                affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>,
                affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>],
              iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"],
              kind = #vector.kind<add>}
            %a_f32, %b_f32, %acc_in : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %acc_out, %out_tile[%zero, %zero, %v, %u, %zero, %zero] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Six middle steps: release the previous input pair, acquire the next one, accumulate.
    scf.for %step = %zero to %six step %one {
      aie.objectfifo.release @obj2(Consume, 1)
      %in2_view_mid = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %in2_buf_mid = aie.objectfifo.subview.access %in2_view_mid[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %in2_tile_mid = memref.reinterpret_cast %in2_buf_mid to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
      aie.objectfifo.release @obj4(Consume, 1)
      %in4_view_mid = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %in4_buf_mid = aie.objectfifo.subview.access %in4_view_mid[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %in4_tile_mid = memref.reinterpret_cast %in4_buf_mid to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
      scf.for %u = %zero to %eight step %one {
        scf.for %v = %zero to %eight step %one {
          scf.for %w = %zero to %four step %one {
            %a_vec = vector.transfer_read %in4_tile_mid[%zero, %zero, %w, %u, %zero, %zero], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
            %b_vec = vector.transfer_read %in2_tile_mid[%zero, %zero, %v, %w, %zero, %zero], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
            %acc_in = vector.transfer_read %out_tile[%zero, %zero, %v, %u, %zero, %zero], %zero_f32 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
            %a_f32 = arith.extf %a_vec : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
            %b_f32 = arith.extf %b_vec : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
            %acc_out = vector.contract {
                indexing_maps = [
                  affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>,
                  affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>,
                  affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>],
                iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"],
                kind = #vector.kind<add>}
              %a_f32, %b_f32, %acc_in : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
            vector.transfer_write %acc_out, %out_tile[%zero, %zero, %v, %u, %zero, %zero] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Final step: swap in the last input pair and accumulate it.
    aie.objectfifo.release @obj2(Consume, 1)
    %in2_view_last = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %in2_buf_last = aie.objectfifo.subview.access %in2_view_last[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %in2_tile_last = memref.reinterpret_cast %in2_buf_last to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    aie.objectfifo.release @obj4(Consume, 1)
    %in4_view_last = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %in4_buf_last = aie.objectfifo.subview.access %in4_view_last[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %in4_tile_last = memref.reinterpret_cast %in4_buf_last to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
    scf.for %u = %zero to %eight step %one {
      scf.for %v = %zero to %eight step %one {
        scf.for %w = %zero to %four step %one {
          %a_vec = vector.transfer_read %in4_tile_last[%zero, %zero, %w, %u, %zero, %zero], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %b_vec = vector.transfer_read %in2_tile_last[%zero, %zero, %v, %w, %zero, %zero], %pad_bf16 {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %acc_in = vector.transfer_read %out_tile[%zero, %zero, %v, %u, %zero, %zero], %zero_f32 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %a_f32 = arith.extf %a_vec : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %b_f32 = arith.extf %b_vec : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %acc_out = vector.contract {
              indexing_maps = [
                affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>,
                affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>,
                affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>],
              iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"],
              kind = #vector.kind<add>}
            %a_f32, %b_f32, %acc_in : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %acc_out, %out_tile[%zero, %zero, %v, %u, %zero, %zero] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Hand back the last input pair and publish the finished output element.
    aie.objectfifo.release @obj4(Consume, 1)
    aie.objectfifo.release @obj2(Consume, 1)
    aie.objectfifo.release @obj6(Produce, 1)
  }
  aie.end
}
// Core program for %tile_1_2.
// For each of 4 rounds it acquires one @obj7 output element, zero-fills it,
// accumulates 8 input pairs from @obj3 / @obj4 into it (1 initial pair,
// 6 pairs in the middle loop, 1 final pair), and then releases it.
//
// The accumulator view is memref<1x1x8x8x4x4xf32>, so its two leading
// indices must be %c0 (the zero-fill below also uses %c0). They are marked
// in_bounds = true accordingly; an index of 1 there is out of bounds, and
// with in_bounds = false the read would return padding and the write would
// be dropped.
%core_1_2 = aie.core(%tile_1_2) {
  %c6 = arith.constant 6 : index
  %c8 = arith.constant 8 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : bf16
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  scf.for %arg0 = %c0 to %c4 step %c1 {
    // Output element, viewed as a 1x1x8x8x4x4 f32 tile.
    %0 = aie.objectfifo.acquire @obj7(Produce, 1) : !aie.objectfifosubview<memref<1024xf32, 1>>
    %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xf32, 1>> -> memref<1024xf32, 1>
    %reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 1> to memref<1x1x8x8x4x4xf32, 1>
    // First @obj3 element, viewed as 1x1x8x4x8x4 bf16.
    %2 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_1 = memref.reinterpret_cast %3 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    // First @obj4 element, viewed as 1x1x4x8x4x8 bf16.
    %4 = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_2 = memref.reinterpret_cast %5 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>

    // Zero-fill every element of the output tile.
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          scf.for %arg4 = %c0 to %c4 step %c1 {
            memref.store %cst_0, %reinterpret_cast[%c0, %c0, %arg1, %arg2, %arg3, %arg4] : memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Accumulate the first input pair into the output tile, one 4x4 block at a time.
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          %10 = vector.transfer_read %reinterpret_cast_2[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %11 = vector.transfer_read %reinterpret_cast_1[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %12 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %15, %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Six middle steps: release the previous input pair, acquire the next one, accumulate.
    scf.for %arg1 = %c0 to %c6 step %c1 {
      aie.objectfifo.release @obj3(Consume, 1)
      %10 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %reinterpret_cast_5 = memref.reinterpret_cast %11 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
      aie.objectfifo.release @obj4(Consume, 1)
      %12 = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %reinterpret_cast_6 = memref.reinterpret_cast %13 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c8 step %c1 {
          scf.for %arg4 = %c0 to %c4 step %c1 {
            %14 = vector.transfer_read %reinterpret_cast_6[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
            %15 = vector.transfer_read %reinterpret_cast_5[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
            %16 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
            %17 = arith.extf %14 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
            %18 = arith.extf %15 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
            %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
            vector.transfer_write %19, %reinterpret_cast[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Final step: swap in the last input pair and accumulate it.
    aie.objectfifo.release @obj3(Consume, 1)
    %6 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_3 = memref.reinterpret_cast %7 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    aie.objectfifo.release @obj4(Consume, 1)
    %8 = aie.objectfifo.acquire @obj4(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_4 = memref.reinterpret_cast %9 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          %10 = vector.transfer_read %reinterpret_cast_4[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %11 = vector.transfer_read %reinterpret_cast_3[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %12 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %15, %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Hand back the last input pair and publish the finished output element.
    aie.objectfifo.release @obj3(Consume, 1)
    aie.objectfifo.release @obj4(Consume, 1)
    aie.objectfifo.release @obj7(Produce, 1)
  }
  aie.end
}
// Core program for %tile_0_3.
// For each of 4 rounds it acquires one @obj8 output element, zero-fills it,
// accumulates 8 input pairs from @obj2 / @obj5 into it (1 initial pair,
// 6 pairs in the middle loop, 1 final pair), and then releases it.
//
// The accumulator view is memref<1x1x8x8x4x4xf32>, so its two leading
// indices must be %c0 (the zero-fill below also uses %c0). They are marked
// in_bounds = true accordingly; an index of 1 there is out of bounds, and
// with in_bounds = false the read would return padding and the write would
// be dropped.
%core_0_3 = aie.core(%tile_0_3) {
  %c6 = arith.constant 6 : index
  %c8 = arith.constant 8 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : bf16
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  scf.for %arg0 = %c0 to %c4 step %c1 {
    // Output element, viewed as a 1x1x8x8x4x4 f32 tile.
    %0 = aie.objectfifo.acquire @obj8(Produce, 1) : !aie.objectfifosubview<memref<1024xf32, 1>>
    %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xf32, 1>> -> memref<1024xf32, 1>
    %reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 1> to memref<1x1x8x8x4x4xf32, 1>
    // First @obj2 element, viewed as 1x1x8x4x8x4 bf16.
    %2 = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_1 = memref.reinterpret_cast %3 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    // First @obj5 element, viewed as 1x1x4x8x4x8 bf16.
    %4 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_2 = memref.reinterpret_cast %5 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>

    // Zero-fill every element of the output tile.
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          scf.for %arg4 = %c0 to %c4 step %c1 {
            memref.store %cst_0, %reinterpret_cast[%c0, %c0, %arg1, %arg2, %arg3, %arg4] : memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Accumulate the first input pair into the output tile, one 4x4 block at a time.
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          %10 = vector.transfer_read %reinterpret_cast_2[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %11 = vector.transfer_read %reinterpret_cast_1[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %12 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %15, %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Six middle steps: release the previous input pair, acquire the next one, accumulate.
    scf.for %arg1 = %c0 to %c6 step %c1 {
      aie.objectfifo.release @obj2(Consume, 1)
      %10 = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %reinterpret_cast_5 = memref.reinterpret_cast %11 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
      aie.objectfifo.release @obj5(Consume, 1)
      %12 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
      %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
      %reinterpret_cast_6 = memref.reinterpret_cast %13 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c8 step %c1 {
          scf.for %arg4 = %c0 to %c4 step %c1 {
            %14 = vector.transfer_read %reinterpret_cast_6[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
            %15 = vector.transfer_read %reinterpret_cast_5[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
            %16 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg3, %arg2, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
            %17 = arith.extf %14 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
            %18 = arith.extf %15 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
            %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
            vector.transfer_write %19, %reinterpret_cast[%c0, %c0, %arg3, %arg2, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
          }
        }
      }
    }

    // Final step: swap in the last input pair and accumulate it.
    aie.objectfifo.release @obj2(Consume, 1)
    %6 = aie.objectfifo.acquire @obj2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_3 = memref.reinterpret_cast %7 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1>
    aie.objectfifo.release @obj5(Consume, 1)
    %8 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>>
    %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1>
    %reinterpret_cast_4 = memref.reinterpret_cast %9 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1>
    scf.for %arg1 = %c0 to %c8 step %c1 {
      scf.for %arg2 = %c0 to %c8 step %c1 {
        scf.for %arg3 = %c0 to %c4 step %c1 {
          %10 = vector.transfer_read %reinterpret_cast_4[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16>
          %11 = vector.transfer_read %reinterpret_cast_3[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16>
          %12 = vector.transfer_read %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [true, true, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32>
          %13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32>
          %14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32>
          %15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32>
          vector.transfer_write %15, %reinterpret_cast[%c0, %c0, %arg2, %arg1, %c0, %c0] {in_bounds = [true, true, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1>
        }
      }
    }

    // Hand back the last input pair and publish the finished output element.
    aie.objectfifo.release @obj5(Consume, 1)
    aie.objectfifo.release @obj2(Consume, 1)
    aie.objectfifo.release @obj8(Produce, 1)
  }
  aie.end
}
%core_1_3 = aie.core(%tile_1_3) { | |
%c6 = arith.constant 6 : index | |
%c8 = arith.constant 8 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%cst = arith.constant 0.000000e+00 : bf16 | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f32 | |
scf.for %arg0 = %c0 to %c4 step %c1 { | |
%0 = aie.objectfifo.acquire @obj9(Produce, 1) : !aie.objectfifosubview<memref<1024xf32, 1>> | |
%1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xf32, 1>> -> memref<1024xf32, 1> | |
%reinterpret_cast = memref.reinterpret_cast %1 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 1> to memref<1x1x8x8x4x4xf32, 1> | |
%2 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>> | |
%3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1> | |
%reinterpret_cast_1 = memref.reinterpret_cast %3 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1> | |
%4 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>> | |
%5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1> | |
%reinterpret_cast_2 = memref.reinterpret_cast %5 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1> | |
scf.for %arg1 = %c0 to %c8 step %c1 { | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c4 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
memref.store %cst_0, %reinterpret_cast[%c0, %c0, %arg1, %arg2, %arg3, %arg4] : memref<1x1x8x8x4x4xf32, 1> | |
} | |
} | |
} | |
} | |
scf.for %arg1 = %c0 to %c8 step %c1 { | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c4 step %c1 { | |
%10 = vector.transfer_read %reinterpret_cast_2[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16> | |
%11 = vector.transfer_read %reinterpret_cast_1[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16> | |
%12 = vector.transfer_read %reinterpret_cast[%c1, %c1, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [false, false, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32> | |
%13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %15, %reinterpret_cast[%c1, %c1, %arg2, %arg1, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1> | |
} | |
} | |
} | |
scf.for %arg1 = %c0 to %c6 step %c1 { | |
aie.objectfifo.release @obj3(Consume, 1) | |
%10 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>> | |
%11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1> | |
%reinterpret_cast_5 = memref.reinterpret_cast %11 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1> | |
aie.objectfifo.release @obj5(Consume, 1) | |
%12 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>> | |
%13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1> | |
%reinterpret_cast_6 = memref.reinterpret_cast %13 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1> | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c8 step %c1 { | |
scf.for %arg4 = %c0 to %c4 step %c1 { | |
%14 = vector.transfer_read %reinterpret_cast_6[%c0, %c0, %arg4, %arg2, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16> | |
%15 = vector.transfer_read %reinterpret_cast_5[%c0, %c0, %arg3, %arg4, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16> | |
%16 = vector.transfer_read %reinterpret_cast[%c1, %c1, %arg3, %arg2, %c0, %c0], %cst_0 {in_bounds = [false, false, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32> | |
%17 = arith.extf %14 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%18 = arith.extf %15 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %18, %16 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %19, %reinterpret_cast[%c1, %c1, %arg3, %arg2, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1> | |
} | |
} | |
} | |
} | |
aie.objectfifo.release @obj3(Consume, 1) | |
%6 = aie.objectfifo.acquire @obj3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>> | |
%7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1> | |
%reinterpret_cast_3 = memref.reinterpret_cast %7 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 1> to memref<1x1x8x4x8x4xbf16, 1> | |
aie.objectfifo.release @obj5(Consume, 1) | |
%8 = aie.objectfifo.acquire @obj5(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16, 1>> | |
%9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<1024xbf16, 1>> -> memref<1024xbf16, 1> | |
%reinterpret_cast_4 = memref.reinterpret_cast %9 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 1> to memref<1x1x4x8x4x8xbf16, 1> | |
scf.for %arg1 = %c0 to %c8 step %c1 { | |
scf.for %arg2 = %c0 to %c8 step %c1 { | |
scf.for %arg3 = %c0 to %c4 step %c1 { | |
%10 = vector.transfer_read %reinterpret_cast_4[%c0, %c0, %arg3, %arg1, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x4x8x4x8xbf16, 1>, vector<1x1x1x1x4x8xbf16> | |
%11 = vector.transfer_read %reinterpret_cast_3[%c0, %c0, %arg2, %arg3, %c0, %c0], %cst {in_bounds = [true, true, true, true, true, true]} : memref<1x1x8x4x8x4xbf16, 1>, vector<1x1x1x1x8x4xbf16> | |
%12 = vector.transfer_read %reinterpret_cast[%c1, %c1, %arg2, %arg1, %c0, %c0], %cst_0 {in_bounds = [false, false, false, false, true, true]} : memref<1x1x8x8x4x4xf32, 1>, vector<1x1x1x1x4x4xf32> | |
%13 = arith.extf %10 : vector<1x1x1x1x4x8xbf16> to vector<1x1x1x1x4x8xf32> | |
%14 = arith.extf %11 : vector<1x1x1x1x8x4xbf16> to vector<1x1x1x1x8x4xf32> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %13, %14, %12 : vector<1x1x1x1x4x8xf32>, vector<1x1x1x1x8x4xf32> into vector<1x1x1x1x4x4xf32> | |
vector.transfer_write %15, %reinterpret_cast[%c1, %c1, %arg2, %arg1, %c0, %c0] {in_bounds = [false, false, false, false, true, true]} : vector<1x1x1x1x4x4xf32>, memref<1x1x8x8x4x4xf32, 1> | |
} | |
} | |
} | |
aie.objectfifo.release @obj5(Consume, 1) | |
aie.objectfifo.release @obj3(Consume, 1) | |
aie.objectfifo.release @obj9(Produce, 1) | |
} | |
aie.end | |
} | |
// Sequence of aiex.npu DMA ops over the three memref<16384xi32> arguments.
// Every aiex.npu.dma_memcpy_nd is immediately followed by an aiex.npu.dma_wait
// on the same symbol, and every transfer uses id = 0.
//
// The body consists of four groups with the same shape:
//   - 8 rounds of: %arg0 transfer (metadata @obj0), wait, %arg1 transfer
//     (metadata @obj1), wait;
//   - then one %arg2 transfer (metadata @obj10) and a wait.
// Within a group only the last entry of the offsets list changes: it advances
// by 32 per round for %arg0 and by 4096 per round for %arg1. Sizes and strides
// are the same for all transfers on a given argument.
//
// Presumably %arg0/%arg1 are the two matmul inputs and %arg2 is the result
// buffer -- TODO confirm (@obj10 is not defined in the visible part of the file).
// NOTE(review): several last-position offsets (e.g. 16384..28736 on %arg1 and
// 16384..16608 on %arg0) are >= the 16384 extent in the memref type; confirm the
// offset units and whether these values are intended.
func.func @matmul_i8_i32_dispatch_0_matmul_128x128x256_bf16xbf16xf32(%arg0: memref<16384xi32>, %arg1: memref<16384xi32>, %arg2: memref<16384xi32>) { | |
// Group 1: %arg0 offsets 0..224 (step 32), %arg1 offsets 0..28672 (step 4096).
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 32][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 4096][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 64][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 8192][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 96][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 12288][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 128][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 16384][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 160][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 20480][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 192][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 24576][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 224][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 28672][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
// End of group 1: %arg2 transfer at offset 0.
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][0, 0, 64, 32][1, 1, 64]) {id = 0 : i64, metadata = @obj10} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj10} | |
// Group 2: same %arg0 offsets as group 1 (0..224); %arg1 offsets are group 1's plus 64 (64..28736).
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 64][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 32][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 4160][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 64][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 8256][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 96][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 12352][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 128][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 16448][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 160][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 20544][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 192][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 24640][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 224][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 28736][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
// End of group 2: %arg2 transfer at offset 64.
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 64][0, 0, 64, 32][1, 1, 64]) {id = 0 : i64, metadata = @obj10} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj10} | |
// Group 3: %arg0 offsets are group 1's plus 16384 (16384..16608); %arg1 offsets as in group 1 (0..28672).
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16384][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16416][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 4096][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16448][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 8192][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16480][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 12288][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16512][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 16384][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16544][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 20480][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16576][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 24576][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16608][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 28672][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
// End of group 3: %arg2 transfer at offset 8192.
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 8192][0, 0, 64, 32][1, 1, 64]) {id = 0 : i64, metadata = @obj10} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj10} | |
// Group 4: %arg0 offsets as in group 3 (16384..16608); %arg1 offsets as in group 2 (64..28736).
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16384][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 64][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16416][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 4160][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16448][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 8256][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16480][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 12352][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16512][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 16448][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16544][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 20544][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16576][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 24640][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16608][0, 2, 32, 16][1, 4096, 128]) {id = 0 : i64, metadata = @obj0} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj0} | |
aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 28736][0, 2, 32, 16][1, 16, 64]) {id = 0 : i64, metadata = @obj1} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj1} | |
// End of group 4: %arg2 transfer at offset 8256 (= 8192 + 64).
aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 8256][0, 0, 64, 32][1, 1, 64]) {id = 0 : i64, metadata = @obj10} : memref<16384xi32> | |
aiex.npu.dma_wait {symbol = @obj10} | |
return | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment