// fusion lowering: matmul + bias add (gist by @gysit, April 11, 2022).
// The four listings below show the same function at successive stages of a
// Linalg-on-tensors lowering pipeline: initial IR; after fusion, tiling,
// and padding; after vectorization; and after bufferization.
// 1) Initial IR.
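// The function computes C = A * B + bias, with A: 601x513, B: 513x321,
// bias: 321, all f32; the result is written into the inplaceable output
// tensor %arg3. For reference, a C-style sketch of the same computation
// (illustration only, not part of the original gist):
//
//   for (int i = 0; i < 601; ++i)
//     for (int j = 0; j < 321; ++j) {
//       float acc = 0.0f;                    // linalg.fill
//       for (int k = 0; k < 513; ++k)
//         acc += A[i][k] * B[k][j];          // linalg.matmul
//       C[i][j] = acc + bias[j];             // linalg.generic (bias add)
//     }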
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%cst = arith.constant 0.000000e+00 : f32
%0 = linalg.fill ins(%cst : f32) outs(%arg3 : tensor<601x321xf32>) -> tensor<601x321xf32>
%1 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<601x513xf32>, tensor<513x321xf32>) outs(%0 : tensor<601x321xf32>) -> tensor<601x321xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<321xf32>) outs(%1 : tensor<601x321xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%3 = arith.addf %arg5, %arg4 : f32
linalg.yield %3 : f32
} -> tensor<601x321xf32>
return %2 : tensor<601x321xf32>
}
// 2) After fusion, tiling, and padding.
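// The matmul is tiled 12x32x16 (M x N x K) and every operand tile is
// zero-padded to full size, so the inner loops always see static 12x16,
// 16x32, and 12x32 shapes despite the odd problem sizes. The padded tiles
// are hoisted out of the compute loops into packed tensors:
//   bias: tensor<11x32>        11 = ceil(321/32) N tiles
//   RHS:  tensor<11x33x16x32>  33 = ceil(513/16) K tiles
//   LHS:  tensor<33x12x16>     repacked once per M tile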
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c601 = arith.constant 601 : index
%c321 = arith.constant 321 : index
%c0 = arith.constant 0 : index
%c513 = arith.constant 513 : index
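// Pack the bias: one row of 32 per N tile; the last tile (321 = 10*32 + 1)
// is padded with zeros to full width.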
%0 = linalg.init_tensor [11, 32] : tensor<11x32xf32>
%1 = tensor.cast %0 : tensor<11x32xf32> to tensor<?x32xf32>
%2 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %1) -> (tensor<?x32xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%11 = tensor.extract_slice %arg2[%arg4] [%10] [1] : tensor<321xf32> to tensor<?xf32>
%12 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%10)
%13 = tensor.pad %11 low[%c0] high[%12] {
^bb0(%arg6: index):
tensor.yield %cst : f32
} : tensor<?xf32> to tensor<32xf32>
%14 = tensor.insert_slice %13 into %arg5[%9, 0] [1, 32] [1, 1] : tensor<32xf32> into tensor<?x32xf32>
scf.yield %14 : tensor<?x32xf32>
}
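// Pack the RHS into 11x33 padded 16x32 tiles. The `nofold` attribute on
// tensor.pad keeps the copy even when a tile is already full-sized, so
// every tile consumed below is read from the packed layout.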
%3 = linalg.init_tensor [11, 33, 16, 32] : tensor<11x33x16x32xf32>
%4 = tensor.cast %3 : tensor<11x33x16x32xf32> to tensor<?x?x16x32xf32>
%5 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %4) -> (tensor<?x?x16x32xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%11 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%10)
%12 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %arg5) -> (tensor<?x?x16x32xf32>) {
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%14 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6)
%15 = tensor.extract_slice %arg1[%arg6, %arg4] [%14, %10] [1, 1] : tensor<513x321xf32> to tensor<?x?xf32>
%16 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%14)
%17 = tensor.pad %15 nofold low[%c0, %c0] high[%16, %11] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<16x32xf32>
%18 = tensor.insert_slice %17 into %arg7[%9, %13, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor<16x32xf32> into tensor<?x?x16x32xf32>
scf.yield %18 : tensor<?x?x16x32xf32>
}
scf.yield %12 : tensor<?x?x16x32xf32>
}
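// Buffer for the packed LHS: 33 padded 12x16 tiles (one per K tile),
// refilled for every M tile inside the outer loop below.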
%6 = linalg.init_tensor [33, 12, 16] : tensor<33x12x16xf32>
%7 = tensor.cast %6 : tensor<33x12x16xf32> to tensor<?x12x16xf32>
%8 = scf.for %arg4 = %c0 to %c601 step %c12 iter_args(%arg5 = %arg3) -> (tensor<601x321xf32>) {
%9 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4)
%10 = affine.apply affine_map<(d0) -> (-d0 + 12)>(%9)
%11 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %7) -> (tensor<?x12x16xf32>) {
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%14 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6)
%15 = tensor.extract_slice %arg0[%arg4, %arg6] [%9, %14] [1, 1] : tensor<601x513xf32> to tensor<?x?xf32>
%16 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%14)
%17 = tensor.pad %15 nofold low[%c0, %c0] high[%10, %16] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<12x16xf32>
%18 = tensor.insert_slice %17 into %arg7[%13, 0, 0] [1, 12, 16] [1, 1, 1] : tensor<12x16xf32> into tensor<?x12x16xf32>
scf.yield %18 : tensor<?x12x16xf32>
}
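// Compute over the N tiles: fill a padded 12x32 accumulator, reduce over K
// using the packed LHS and RHS tiles, add the packed bias, then write only
// the valid %9 x %13 region back into the 601x321 result.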
%12 = scf.for %arg6 = %c0 to %c321 step %c32 iter_args(%arg7 = %arg5) -> (tensor<601x321xf32>) {
%13 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg6)
%14 = tensor.extract_slice %arg7[%arg4, %arg6] [%9, %13] [1, 1] : tensor<601x321xf32> to tensor<?x?xf32>
%15 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%13)
%16 = tensor.pad %14 low[%c0, %c0] high[%10, %15] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<12x32xf32>
%17 = linalg.fill {iree_linalg_transform.matched} ins(%cst : f32) outs(%16 : tensor<12x32xf32>) -> tensor<12x32xf32>
%18 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg6)
%19 = scf.for %arg8 = %c0 to %c513 step %c16 iter_args(%arg9 = %17) -> (tensor<12x32xf32>) {
%24 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg8)
%25 = tensor.extract_slice %11[%24, 0, 0] [1, 12, 16] [1, 1, 1] : tensor<?x12x16xf32> to tensor<12x16xf32>
%26 = tensor.extract_slice %5[%18, %24, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor<?x?x16x32xf32> to tensor<16x32xf32>
%27 = linalg.matmul {cast = #linalg.type_fn<cast_signed>, iree_linalg_transform.matched} ins(%25, %26 : tensor<12x16xf32>, tensor<16x32xf32>) outs(%arg9 : tensor<12x32xf32>) -> tensor<12x32xf32>
scf.yield %27 : tensor<12x32xf32>
}
%20 = tensor.extract_slice %2[%18, 0] [1, 32] [1, 1] : tensor<?x32xf32> to tensor<32xf32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%20 : tensor<32xf32>) outs(%19 : tensor<12x32xf32>) attrs = {iree_linalg_transform.matched} {
^bb0(%arg8: f32, %arg9: f32):
%24 = arith.addf %arg9, %arg8 : f32
linalg.yield %24 : f32
} -> tensor<12x32xf32>
%22 = tensor.extract_slice %21[0, 0] [%9, %13] [1, 1] : tensor<12x32xf32> to tensor<?x?xf32>
%23 = tensor.insert_slice %22 into %arg7[%arg4, %arg6] [%9, %13] [1, 1] : tensor<?x?xf32> into tensor<601x321xf32>
scf.yield %23 : tensor<601x321xf32>
}
scf.yield %12 : tensor<601x321xf32>
}
return %8 : tensor<601x321xf32>
}
// 3) After vectorization.
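// Vectorization replaces the pad/fill/matmul/generic ops with vector
// transfers and arithmetic: the pad + insert_slice packing becomes a
// vector.transfer_read (out-of-bounds elements filled with %cst) followed
// by a vector.transfer_write, the matmul becomes a vector.contract on
// vector<12x16> x vector<16x32> operands, and the bias add becomes a
// vector.broadcast + arith.addf.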
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c601 = arith.constant 601 : index
%c321 = arith.constant 321 : index
%c0 = arith.constant 0 : index
%c513 = arith.constant 513 : index
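// Zero vector seeding the in-register accumulator; it replaces the padded
// linalg.fill of the output tile from step 2.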
%cst_0 = arith.constant dense<0.000000e+00> : vector<12x32xf32>
%0 = linalg.init_tensor [11, 32] : tensor<11x32xf32>
%1 = tensor.cast %0 : tensor<11x32xf32> to tensor<?x32xf32>
%2 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %1) -> (tensor<?x32xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%11 = tensor.extract_slice %arg2[%arg4] [%10] [1] : tensor<321xf32> to tensor<?xf32>
%12 = vector.transfer_read %11[%c0], %cst : tensor<?xf32>, vector<32xf32>
%13 = vector.transfer_write %12, %arg5[%9, %c0] {in_bounds = [true]} : vector<32xf32>, tensor<?x32xf32>
scf.yield %13 : tensor<?x32xf32>
}
%3 = linalg.init_tensor [11, 33, 16, 32] : tensor<11x33x16x32xf32>
%4 = tensor.cast %3 : tensor<11x33x16x32xf32> to tensor<?x?x16x32xf32>
%5 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %4) -> (tensor<?x?x16x32xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%11 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %arg5) -> (tensor<?x?x16x32xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%13 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6)
%14 = tensor.extract_slice %arg1[%arg6, %arg4] [%13, %10] [1, 1] : tensor<513x321xf32> to tensor<?x?xf32>
%15 = vector.transfer_read %14[%c0, %c0], %cst : tensor<?x?xf32>, vector<16x32xf32>
%16 = vector.transfer_write %15, %arg7[%9, %12, %c0, %c0] {in_bounds = [true, true]} : vector<16x32xf32>, tensor<?x?x16x32xf32>
scf.yield %16 : tensor<?x?x16x32xf32>
}
scf.yield %11 : tensor<?x?x16x32xf32>
}
%6 = linalg.init_tensor [33, 12, 16] : tensor<33x12x16xf32>
%7 = tensor.cast %6 : tensor<33x12x16xf32> to tensor<?x12x16xf32>
%8 = scf.for %arg4 = %c0 to %c601 step %c12 iter_args(%arg5 = %arg3) -> (tensor<601x321xf32>) {
%9 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4)
%10 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %7) -> (tensor<?x12x16xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%13 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6)
%14 = tensor.extract_slice %arg0[%arg4, %arg6] [%9, %13] [1, 1] : tensor<601x513xf32> to tensor<?x?xf32>
%15 = vector.transfer_read %14[%c0, %c0], %cst : tensor<?x?xf32>, vector<12x16xf32>
%16 = vector.transfer_write %15, %arg7[%12, %c0, %c0] {in_bounds = [true, true]} : vector<12x16xf32>, tensor<?x12x16xf32>
scf.yield %16 : tensor<?x12x16xf32>
}
%11 = scf.for %arg6 = %c0 to %c321 step %c32 iter_args(%arg7 = %arg5) -> (tensor<601x321xf32>) {
%12 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg6)
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg6)
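// The K reduction now runs entirely in registers: the vector<12x32xf32>
// accumulator is threaded through iter_args and updated by one
// vector.contract per 16-wide K tile.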
%14 = scf.for %arg8 = %c0 to %c513 step %c16 iter_args(%arg9 = %cst_0) -> (vector<12x32xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg8)
%22 = vector.transfer_read %10[%21, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x12x16xf32>, vector<12x16xf32>
%23 = vector.transfer_read %5[%13, %21, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x?x16x32xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %22, %23, %arg9 : vector<12x16xf32>, vector<16x32xf32> into vector<12x32xf32>
scf.yield %24 : vector<12x32xf32>
}
%15 = vector.transfer_read %2[%13, %c0], %cst {in_bounds = [true]} : tensor<?x32xf32>, vector<32xf32>
%16 = vector.broadcast %15 : vector<32xf32> to vector<12x32xf32>
%17 = arith.addf %14, %16 : vector<12x32xf32>
%18 = tensor.extract_slice %arg7[%arg4, %arg6] [%9, %12] [1, 1] : tensor<601x321xf32> to tensor<?x?xf32>
%19 = vector.transfer_write %17, %18[%c0, %c0] : vector<12x32xf32>, tensor<?x?xf32>
%20 = tensor.insert_slice %19 into %arg7[%arg4, %arg6] [%9, %12] [1, 1] : tensor<?x?xf32> into tensor<601x321xf32>
scf.yield %20 : tensor<601x321xf32>
}
scf.yield %11 : tensor<601x321xf32>
}
return %8 : tensor<601x321xf32>
}
// 4) After bufferization.
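// Bufferization rewrites the function on memrefs: the tensor arguments
// become memref arguments, the result is written in place into %arg3
// (which was marked inplaceable), and extract/insert_slice turn into
// memref.subview plus in-place vector transfers.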
func @matmul_bias_add(%arg0: memref<601x513xf32>, %arg1: memref<513x321xf32>, %arg2: memref<321xf32>, %arg3: memref<601x321xf32>) attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c601 = arith.constant 601 : index
%c321 = arith.constant 321 : index
%c0 = arith.constant 0 : index
%c513 = arith.constant 513 : index
%cst_0 = arith.constant dense<0.000000e+00> : vector<12x32xf32>
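// The three packing tensors become 128-byte-aligned allocations with their
// static upper-bound shapes, and are freed at the end of the function.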
%0 = memref.alloc() {alignment = 128 : i64} : memref<11x32xf32>
scf.for %arg4 = %c0 to %c321 step %c32 {
%3 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%5 = memref.subview %arg2[%arg4] [%4] [1] : memref<321xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
%6 = vector.transfer_read %5[%c0], %cst : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<32xf32>
vector.transfer_write %6, %0[%3, %c0] {in_bounds = [true]} : vector<32xf32>, memref<11x32xf32>
}
%1 = memref.alloc() {alignment = 128 : i64} : memref<11x33x16x32xf32>
scf.for %arg4 = %c0 to %c321 step %c32 {
%3 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
scf.for %arg5 = %c0 to %c513 step %c16 {
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg5)
%6 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg5)
%7 = memref.subview %arg1[%arg5, %arg4] [%6, %4] [1, 1] : memref<513x321xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>
%8 = vector.transfer_read %7[%c0, %c0], %cst : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>, vector<16x32xf32>
vector.transfer_write %8, %1[%3, %5, %c0, %c0] {in_bounds = [true, true]} : vector<16x32xf32>, memref<11x33x16x32xf32>
}
}
%2 = memref.alloc() {alignment = 128 : i64} : memref<33x12x16xf32>
scf.for %arg4 = %c0 to %c601 step %c12 {
%3 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4)
scf.for %arg5 = %c0 to %c513 step %c16 {
%4 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg5)
%5 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg5)
%6 = memref.subview %arg0[%arg4, %arg5] [%3, %5] [1, 1] : memref<601x513xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 513 + s0 + d1)>>
%7 = vector.transfer_read %6[%c0, %c0], %cst : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 513 + s0 + d1)>>, vector<12x16xf32>
vector.transfer_write %7, %2[%4, %c0, %c0] {in_bounds = [true, true]} : vector<12x16xf32>, memref<33x12x16xf32>
}
scf.for %arg5 = %c0 to %c321 step %c32 {
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg5)
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg5)
%6 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %cst_0) -> (vector<12x32xf32>) {
%11 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%12 = vector.transfer_read %2[%11, %c0, %c0], %cst {in_bounds = [true, true]} : memref<33x12x16xf32>, vector<12x16xf32>
%13 = vector.transfer_read %1[%5, %11, %c0, %c0], %cst {in_bounds = [true, true]} : memref<11x33x16x32xf32>, vector<16x32xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg7 : vector<12x16xf32>, vector<16x32xf32> into vector<12x32xf32>
scf.yield %14 : vector<12x32xf32>
}
%7 = vector.transfer_read %0[%5, %c0], %cst {in_bounds = [true]} : memref<11x32xf32>, vector<32xf32>
%8 = vector.broadcast %7 : vector<32xf32> to vector<12x32xf32>
%9 = arith.addf %6, %8 : vector<12x32xf32>
%10 = memref.subview %arg3[%arg4, %arg5] [%3, %4] [1, 1] : memref<601x321xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>
vector.transfer_write %9, %10[%c0, %c0] : vector<12x32xf32>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>
}
}
memref.dealloc %0 : memref<11x32xf32>
memref.dealloc %1 : memref<11x33x16x32xf32>
memref.dealloc %2 : memref<33x12x16xf32>
return
}