Created
April 11, 2022 19:11
-
-
Save gysit/3f844c71bd8fd899a3c32965016d8ffd to your computer and use it in GitHub Desktop.
fusion lowering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// 1) Initial IR: zero-fill the output, compute C = A * B, then add a bias
// vector broadcast along the rows — all expressed on tensors with linalg ops.
// %arg0: LHS A (601x513), %arg1: RHS B (513x321), %arg2: bias (321),
// %arg3: output C (601x321, the only in-placeable operand).
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} { | |
// Zero-initialize the accumulator before the matmul accumulates into it.
%cst = arith.constant 0.000000e+00 : f32 | |
%0 = linalg.fill ins(%cst : f32) outs(%arg3 : tensor<601x321xf32>) -> tensor<601x321xf32> | |
%1 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<601x513xf32>, tensor<513x321xf32>) outs(%0 : tensor<601x321xf32>) -> tensor<601x321xf32> | |
// Bias add: the (d1)-indexed input broadcasts the 321-vector across rows.
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<321xf32>) outs(%1 : tensor<601x321xf32>) { | |
^bb0(%arg4: f32, %arg5: f32): | |
%3 = arith.addf %arg5, %arg4 : f32 | |
linalg.yield %3 : f32 | |
} -> tensor<601x321xf32> | |
return %2 : tensor<601x321xf32> | |
} | |
// 2) After fusion, tiling, and padding. Tile sizes: 12 (rows, M), 32 (cols, N),
// 16 (reduction, K). The three ops are fused into one tiled loop nest, and the
// operands are padded to full tiles and packed into tile-indexed buffers
// (hoisted out of the loops they are invariant in).
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} { | |
%c16 = arith.constant 16 : index | |
%c12 = arith.constant 12 : index | |
%c32 = arith.constant 32 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c601 = arith.constant 601 : index | |
%c321 = arith.constant 321 : index | |
%c0 = arith.constant 0 : index | |
%c513 = arith.constant 513 : index | |
// Pack the bias: 321 cols / 32-tile = 11 tiles, each zero-padded to width 32.
%0 = linalg.init_tensor [11, 32] : tensor<11x32xf32> | |
%1 = tensor.cast %0 : tensor<11x32xf32> to tensor<?x32xf32> | |
%2 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %1) -> (tensor<?x32xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4) | |
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4) | |
%11 = tensor.extract_slice %arg2[%arg4] [%10] [1] : tensor<321xf32> to tensor<?xf32> | |
%12 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%10) | |
%13 = tensor.pad %11 low[%c0] high[%12] { | |
^bb0(%arg6: index): | |
tensor.yield %cst : f32 | |
} : tensor<?xf32> to tensor<32xf32> | |
%14 = tensor.insert_slice %13 into %arg5[%9, 0] [1, 32] [1, 1] : tensor<32xf32> into tensor<?x32xf32> | |
scf.yield %14 : tensor<?x32xf32> | |
} | |
// Pack the RHS B: layout [N-tile (11), K-tile (33), 16, 32], each KxN tile
// zero-padded to a full 16x32 block (pad is marked nofold so packing sticks).
%3 = linalg.init_tensor [11, 33, 16, 32] : tensor<11x33x16x32xf32> | |
%4 = tensor.cast %3 : tensor<11x33x16x32xf32> to tensor<?x?x16x32xf32> | |
%5 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %4) -> (tensor<?x?x16x32xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4) | |
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4) | |
%11 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%10) | |
%12 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %arg5) -> (tensor<?x?x16x32xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6) | |
%14 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6) | |
%15 = tensor.extract_slice %arg1[%arg6, %arg4] [%14, %10] [1, 1] : tensor<513x321xf32> to tensor<?x?xf32> | |
%16 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%14) | |
%17 = tensor.pad %15 nofold low[%c0, %c0] high[%16, %11] { | |
^bb0(%arg8: index, %arg9: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<16x32xf32> | |
%18 = tensor.insert_slice %17 into %arg7[%9, %13, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor<16x32xf32> into tensor<?x?x16x32xf32> | |
scf.yield %18 : tensor<?x?x16x32xf32> | |
} | |
scf.yield %12 : tensor<?x?x16x32xf32> | |
} | |
// Main loop over row tiles (step 12). The LHS packing buffer holds one row
// strip of A: [K-tile (33), 12, 16], refilled per row-tile iteration.
%6 = linalg.init_tensor [33, 12, 16] : tensor<33x12x16xf32> | |
%7 = tensor.cast %6 : tensor<33x12x16xf32> to tensor<?x12x16xf32> | |
%8 = scf.for %arg4 = %c0 to %c601 step %c12 iter_args(%arg5 = %arg3) -> (tensor<601x321xf32>) { | |
%9 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4) | |
%10 = affine.apply affine_map<(d0) -> (-d0 + 12)>(%9) | |
// Pack the current 12-row strip of A, padded to full 12x16 tiles.
%11 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %7) -> (tensor<?x12x16xf32>) { | |
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6) | |
%14 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6) | |
%15 = tensor.extract_slice %arg0[%arg4, %arg6] [%9, %14] [1, 1] : tensor<601x513xf32> to tensor<?x?xf32> | |
%16 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%14) | |
%17 = tensor.pad %15 nofold low[%c0, %c0] high[%10, %16] { | |
^bb0(%arg8: index, %arg9: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<12x16xf32> | |
%18 = tensor.insert_slice %17 into %arg7[%13, 0, 0] [1, 12, 16] [1, 1, 1] : tensor<12x16xf32> into tensor<?x12x16xf32> | |
scf.yield %18 : tensor<?x12x16xf32> | |
} | |
// Loop over column tiles: fill + K-reduction matmul + bias add, all fused
// inside the same 12x32 output tile.
%12 = scf.for %arg6 = %c0 to %c321 step %c32 iter_args(%arg7 = %arg5) -> (tensor<601x321xf32>) { | |
%13 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg6) | |
%14 = tensor.extract_slice %arg7[%arg4, %arg6] [%9, %13] [1, 1] : tensor<601x321xf32> to tensor<?x?xf32> | |
%15 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%13) | |
%16 = tensor.pad %14 low[%c0, %c0] high[%10, %15] { | |
^bb0(%arg8: index, %arg9: index): | |
tensor.yield %cst : f32 | |
} : tensor<?x?xf32> to tensor<12x32xf32> | |
%17 = linalg.fill {iree_linalg_transform.matched} ins(%cst : f32) outs(%16 : tensor<12x32xf32>) -> tensor<12x32xf32> | |
%18 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg6) | |
// K-reduction over packed tiles: 12x16 (from %11) * 16x32 (from %5).
%19 = scf.for %arg8 = %c0 to %c513 step %c16 iter_args(%arg9 = %17) -> (tensor<12x32xf32>) { | |
%24 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg8) | |
%25 = tensor.extract_slice %11[%24, 0, 0] [1, 12, 16] [1, 1, 1] : tensor<?x12x16xf32> to tensor<12x16xf32> | |
%26 = tensor.extract_slice %5[%18, %24, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor<?x?x16x32xf32> to tensor<16x32xf32> | |
%27 = linalg.matmul {cast = #linalg.type_fn<cast_signed>, iree_linalg_transform.matched} ins(%25, %26 : tensor<12x16xf32>, tensor<16x32xf32>) outs(%arg9 : tensor<12x32xf32>) -> tensor<12x32xf32> | |
scf.yield %27 : tensor<12x32xf32> | |
} | |
// Fused bias add on the padded 12x32 tile, reading the packed bias (%2).
%20 = tensor.extract_slice %2[%18, 0] [1, 32] [1, 1] : tensor<?x32xf32> to tensor<32xf32> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%20 : tensor<32xf32>) outs(%19 : tensor<12x32xf32>) attrs = {iree_linalg_transform.matched} { | |
^bb0(%arg8: f32, %arg9: f32): | |
%24 = arith.addf %arg9, %arg8 : f32 | |
linalg.yield %24 : f32 | |
} -> tensor<12x32xf32> | |
// Drop the padding and write the valid %9 x %13 region back.
%22 = tensor.extract_slice %21[0, 0] [%9, %13] [1, 1] : tensor<12x32xf32> to tensor<?x?xf32> | |
%23 = tensor.insert_slice %22 into %arg7[%arg4, %arg6] [%9, %13] [1, 1] : tensor<?x?xf32> into tensor<601x321xf32> | |
scf.yield %23 : tensor<601x321xf32> | |
} | |
scf.yield %12 : tensor<601x321xf32> | |
} | |
return %8 : tensor<601x321xf32> | |
} | |
// 3) After vectorization. tensor.pad + slices become vector.transfer_read
// (padding value %cst supplies out-of-bounds elements), linalg.matmul becomes
// vector.contract, the bias linalg.generic becomes broadcast + addf, and the
// K-loop now carries the 12x32 accumulator as a vector iter_arg.
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} { | |
%c16 = arith.constant 16 : index | |
%c12 = arith.constant 12 : index | |
%c32 = arith.constant 32 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c601 = arith.constant 601 : index | |
%c321 = arith.constant 321 : index | |
%c0 = arith.constant 0 : index | |
%c513 = arith.constant 513 : index | |
// Vector zero replaces the linalg.fill of the accumulator tile.
%cst_0 = arith.constant dense<0.000000e+00> : vector<12x32xf32> | |
// Pack the bias into [11, 32]; the padded read handles the 321 % 32 tail.
%0 = linalg.init_tensor [11, 32] : tensor<11x32xf32> | |
%1 = tensor.cast %0 : tensor<11x32xf32> to tensor<?x32xf32> | |
%2 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %1) -> (tensor<?x32xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4) | |
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4) | |
%11 = tensor.extract_slice %arg2[%arg4] [%10] [1] : tensor<321xf32> to tensor<?xf32> | |
%12 = vector.transfer_read %11[%c0], %cst : tensor<?xf32>, vector<32xf32> | |
%13 = vector.transfer_write %12, %arg5[%9, %c0] {in_bounds = [true]} : vector<32xf32>, tensor<?x32xf32> | |
scf.yield %13 : tensor<?x32xf32> | |
} | |
// Pack the RHS B into [N-tile, K-tile, 16, 32] via padded vector transfers.
%3 = linalg.init_tensor [11, 33, 16, 32] : tensor<11x33x16x32xf32> | |
%4 = tensor.cast %3 : tensor<11x33x16x32xf32> to tensor<?x?x16x32xf32> | |
%5 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %4) -> (tensor<?x?x16x32xf32>) { | |
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4) | |
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4) | |
%11 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %arg5) -> (tensor<?x?x16x32xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6) | |
%13 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6) | |
%14 = tensor.extract_slice %arg1[%arg6, %arg4] [%13, %10] [1, 1] : tensor<513x321xf32> to tensor<?x?xf32> | |
%15 = vector.transfer_read %14[%c0, %c0], %cst : tensor<?x?xf32>, vector<16x32xf32> | |
%16 = vector.transfer_write %15, %arg7[%9, %12, %c0, %c0] {in_bounds = [true, true]} : vector<16x32xf32>, tensor<?x?x16x32xf32> | |
scf.yield %16 : tensor<?x?x16x32xf32> | |
} | |
scf.yield %11 : tensor<?x?x16x32xf32> | |
} | |
// Main loop over 12-row strips; per strip, pack the LHS then compute tiles.
%6 = linalg.init_tensor [33, 12, 16] : tensor<33x12x16xf32> | |
%7 = tensor.cast %6 : tensor<33x12x16xf32> to tensor<?x12x16xf32> | |
%8 = scf.for %arg4 = %c0 to %c601 step %c12 iter_args(%arg5 = %arg3) -> (tensor<601x321xf32>) { | |
%9 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4) | |
%10 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %7) -> (tensor<?x12x16xf32>) { | |
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6) | |
%13 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6) | |
%14 = tensor.extract_slice %arg0[%arg4, %arg6] [%9, %13] [1, 1] : tensor<601x513xf32> to tensor<?x?xf32> | |
%15 = vector.transfer_read %14[%c0, %c0], %cst : tensor<?x?xf32>, vector<12x16xf32> | |
%16 = vector.transfer_write %15, %arg7[%12, %c0, %c0] {in_bounds = [true, true]} : vector<12x16xf32>, tensor<?x12x16xf32> | |
scf.yield %16 : tensor<?x12x16xf32> | |
} | |
%11 = scf.for %arg6 = %c0 to %c321 step %c32 iter_args(%arg7 = %arg5) -> (tensor<601x321xf32>) { | |
%12 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg6) | |
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg6) | |
// K-reduction: accumulate 12x16 * 16x32 contractions into a vector acc.
%14 = scf.for %arg8 = %c0 to %c513 step %c16 iter_args(%arg9 = %cst_0) -> (vector<12x32xf32>) { | |
%21 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg8) | |
%22 = vector.transfer_read %10[%21, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x12x16xf32>, vector<12x16xf32> | |
%23 = vector.transfer_read %5[%13, %21, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x?x16x32xf32>, vector<16x32xf32> | |
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %22, %23, %arg9 : vector<12x16xf32>, vector<16x32xf32> into vector<12x32xf32> | |
scf.yield %24 : vector<12x32xf32> | |
} | |
// Bias add in vector form: broadcast the packed 32-lane bias over 12 rows.
%15 = vector.transfer_read %2[%13, %c0], %cst {in_bounds = [true]} : tensor<?x32xf32>, vector<32xf32> | |
%16 = vector.broadcast %15 : vector<32xf32> to vector<12x32xf32> | |
%17 = arith.addf %14, %16 : vector<12x32xf32> | |
// Write the valid %9 x %12 region back (transfer masks the padded lanes).
%18 = tensor.extract_slice %arg7[%arg4, %arg6] [%9, %12] [1, 1] : tensor<601x321xf32> to tensor<?x?xf32> | |
%19 = vector.transfer_write %17, %18[%c0, %c0] : vector<12x32xf32>, tensor<?x?xf32> | |
%20 = tensor.insert_slice %19 into %arg7[%arg4, %arg6] [%9, %12] [1, 1] : tensor<?x?xf32> into tensor<601x321xf32> | |
scf.yield %20 : tensor<601x321xf32> | |
} | |
scf.yield %11 : tensor<601x321xf32> | |
} | |
return %8 : tensor<601x321xf32> | |
} | |
// 4) After bufferization. Tensors become memrefs: init_tensors turn into
// aligned memref.allocs, the scf.for loops no longer carry tensor iter_args,
// slices become memref.subviews, and the result is written in place into
// %arg3, so the function now returns nothing. Packing buffers are freed at
// the end.
func @matmul_bias_add(%arg0: memref<601x513xf32>, %arg1: memref<513x321xf32>, %arg2: memref<321xf32>, %arg3: memref<601x321xf32>) attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} { | |
%c16 = arith.constant 16 : index | |
%c12 = arith.constant 12 : index | |
%c32 = arith.constant 32 : index | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c601 = arith.constant 601 : index | |
%c321 = arith.constant 321 : index | |
%c0 = arith.constant 0 : index | |
%c513 = arith.constant 513 : index | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<12x32xf32> | |
// Packed-bias buffer [11, 32], 128-byte aligned.
%0 = memref.alloc() {alignment = 128 : i64} : memref<11x32xf32> | |
scf.for %arg4 = %c0 to %c321 step %c32 { | |
%3 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4) | |
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4) | |
%5 = memref.subview %arg2[%arg4] [%4] [1] : memref<321xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>> | |
%6 = vector.transfer_read %5[%c0], %cst : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<32xf32> | |
vector.transfer_write %6, %0[%3, %c0] {in_bounds = [true]} : vector<32xf32>, memref<11x32xf32> | |
} | |
// Packed-RHS buffer [11, 33, 16, 32], filled once before the main loops.
%1 = memref.alloc() {alignment = 128 : i64} : memref<11x33x16x32xf32> | |
scf.for %arg4 = %c0 to %c321 step %c32 { | |
%3 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4) | |
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4) | |
scf.for %arg5 = %c0 to %c513 step %c16 { | |
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg5) | |
%6 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg5) | |
%7 = memref.subview %arg1[%arg5, %arg4] [%6, %4] [1, 1] : memref<513x321xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>> | |
%8 = vector.transfer_read %7[%c0, %c0], %cst : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>, vector<16x32xf32> | |
vector.transfer_write %8, %1[%3, %5, %c0, %c0] {in_bounds = [true, true]} : vector<16x32xf32>, memref<11x33x16x32xf32> | |
} | |
} | |
// Packed-LHS strip buffer [33, 12, 16], refilled per 12-row iteration.
%2 = memref.alloc() {alignment = 128 : i64} : memref<33x12x16xf32> | |
scf.for %arg4 = %c0 to %c601 step %c12 { | |
%3 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4) | |
scf.for %arg5 = %c0 to %c513 step %c16 { | |
%4 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg5) | |
%5 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg5) | |
%6 = memref.subview %arg0[%arg4, %arg5] [%3, %5] [1, 1] : memref<601x513xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 513 + s0 + d1)>> | |
%7 = vector.transfer_read %6[%c0, %c0], %cst : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 513 + s0 + d1)>>, vector<12x16xf32> | |
vector.transfer_write %7, %2[%4, %c0, %c0] {in_bounds = [true, true]} : vector<12x16xf32>, memref<33x12x16xf32> | |
} | |
// Per column tile: vector K-reduction, bias add, in-place write to %arg3.
scf.for %arg5 = %c0 to %c321 step %c32 { | |
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg5) | |
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg5) | |
%6 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %cst_0) -> (vector<12x32xf32>) { | |
%11 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6) | |
%12 = vector.transfer_read %2[%11, %c0, %c0], %cst {in_bounds = [true, true]} : memref<33x12x16xf32>, vector<12x16xf32> | |
%13 = vector.transfer_read %1[%5, %11, %c0, %c0], %cst {in_bounds = [true, true]} : memref<11x33x16x32xf32>, vector<16x32xf32> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg7 : vector<12x16xf32>, vector<16x32xf32> into vector<12x32xf32> | |
scf.yield %14 : vector<12x32xf32> | |
} | |
%7 = vector.transfer_read %0[%5, %c0], %cst {in_bounds = [true]} : memref<11x32xf32>, vector<32xf32> | |
%8 = vector.broadcast %7 : vector<32xf32> to vector<12x32xf32> | |
%9 = arith.addf %6, %8 : vector<12x32xf32> | |
%10 = memref.subview %arg3[%arg4, %arg5] [%3, %4] [1, 1] : memref<601x321xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>> | |
vector.transfer_write %9, %10[%c0, %c0] : vector<12x32xf32>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>> | |
} | |
} | |
// Free the three packing buffers; the result lives in %arg3.
memref.dealloc %0 : memref<11x32xf32> | |
memref.dealloc %1 : memref<11x33x16x32xf32> | |
memref.dealloc %2 : memref<33x12x16xf32> | |
return | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment