Created
December 10, 2021 15:13
-
-
Save gysit/d471ff1621db0250fa77ba74bca9d0f7 to your computer and use it in GitHub Desktop.
conv2d lowering with two vectorization passes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// IR after Vectorize(fun_name, "", vectorize_paddings=False)
//
// Tiled 2-D convolution on tensors (the function name and the 8x18x17x32 /
// 3x3x32x64 / 8x16x15x64 shapes suggest an NHWC input, HWCF filter, stride-1
// layout -- confirm against the generating pipeline).
//
//   %arg0 : input tensor,  tensor<8x18x17x32xf32>
//   %arg1 : filter tensor, tensor<3x3x32x64xf32>
//   %arg2 : output tensor, tensor<8x16x15x64xf32> (marked linalg.inplaceable)
//
// In this variant the contraction itself is a vector.contract, but the
// partial tiles along dim 2 are still materialized with linalg.pad_tensor
// before being read with fully in-bounds vector.transfer_read ops. The output
// tile is re-extracted, padded, read, written and re-inserted on every
// iteration of the innermost loop.
func @conv_2d_nhwc_hwcf_main(%arg0: tensor<8x18x17x32xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg1: tensor<3x3x32x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg2: tensor<8x16x15x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = true}) -> tensor<8x16x15x64xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
  %cst = arith.constant dense<0.000000e+00> : vector<8x16x15x64xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c32 = arith.constant 32 : index
  %c16 = arith.constant 16 : index
  %c15 = arith.constant 15 : index
  %c64 = arith.constant 64 : index
  %c3 = arith.constant 3 : index
  %c0 = arith.constant 0 : index
  // Zero-fill the whole output tensor before accumulating into it.
  %0 = vector.transfer_write %cst, %arg2[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<8x16x15x64xf32>, tensor<8x16x15x64xf32>
  // %arg3: output dim 0, one element per iteration.
  %1 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %0) -> (tensor<8x16x15x64xf32>) {
    // %arg5: output dim 1, one element per iteration.
    %2 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %arg4) -> (tensor<8x16x15x64xf32>) {
      // %arg7: output dim 2 in tiles of 8; 15 is not a multiple of 8, so the
      // last tile is partial.
      %3 = scf.for %arg7 = %c0 to %c15 step %c8 iter_args(%arg8 = %arg6) -> (tensor<8x16x15x64xf32>) {
        // %4 = valid extent of this output tile along dim 2: min(8, 15 - %arg7).
        %4 = affine.min affine_map<(d0) -> (8, -d0 + 15)>(%arg7)
        // %5 = high padding needed to bring the output tile up to 8: 8 - %4.
        %5 = affine.apply affine_map<(d0) -> (-d0 + 8)>(%4)
        // %arg9: output dim 3 in tiles of 32.
        %6 = scf.for %arg9 = %c0 to %c64 step %c32 iter_args(%arg10 = %arg8) -> (tensor<8x16x15x64xf32>) {
          // %arg11: filter dim 0 (extent 3).
          %7 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (tensor<8x16x15x64xf32>) {
            // %8 = input dim-1 index: %arg5 + %arg11.
            %8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg11)
            // %arg13: filter dim 1 (extent 3).
            %9 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (tensor<8x16x15x64xf32>) {
              // %10 = input dim-2 start offset: %arg7 + %arg13.
              %10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg7, %arg13)
              // %11 = valid extent of the input tile along dim 2:
              //       min(8, 17 - %arg7 - %arg13).
              %11 = affine.min affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>(%arg7, %arg13)
              // %12 = high padding needed to bring the input tile up to 8: 8 - %11.
              %12 = affine.apply affine_map<(d0) -> (-d0 + 8)>(%11)
              // %arg15: reduction over input dim 3 / filter dim 2 in tiles of 8.
              %13 = scf.for %arg15 = %c0 to %c32 step %c8 iter_args(%arg16 = %arg14) -> (tensor<8x16x15x64xf32>) {
                // Slice the input, filter and current output tiles. The input
                // and output slices have a dynamic size along dim 2.
                %14 = tensor.extract_slice %arg0[%arg3, %8, %10, %arg15] [1, 1, %11, 8] [1, 1, 1, 1] : tensor<8x18x17x32xf32> to tensor<1x1x?x8xf32>
                %15 = tensor.extract_slice %arg1[%arg11, %arg13, %arg15, %arg9] [1, 1, 8, 32] [1, 1, 1, 1] : tensor<3x3x32x64xf32> to tensor<1x1x8x32xf32>
                %16 = tensor.extract_slice %arg16[%arg3, %arg5, %arg7, %arg9] [1, 1, %4, 32] [1, 1, 1, 1] : tensor<8x16x15x64xf32> to tensor<1x1x?x32xf32>
                // Pad the dynamic input tile with zeros up to the static 1x1x8x8 shape.
                %17 = linalg.pad_tensor %14 low[%c0, %c0, %c0, %c0] high[%c0, %c0, %12, %c0] {
                ^bb0(%arg17: index, %arg18: index, %arg19: index, %arg20: index): // no predecessors
                  linalg.yield %cst_0 : f32
                } : tensor<1x1x?x8xf32> to tensor<1x1x8x8xf32>
                // Pad the dynamic output tile with zeros up to the static 1x1x8x32 shape.
                %18 = linalg.pad_tensor %16 low[%c0, %c0, %c0, %c0] high[%c0, %c0, %5, %c0] {
                ^bb0(%arg17: index, %arg18: index, %arg19: index, %arg20: index): // no predecessors
                  linalg.yield %cst_0 : f32
                } : tensor<1x1x?x32xf32> to tensor<1x1x8x32xf32>
                // All three reads are fully in bounds because the operands are
                // statically shaped (padded input, filter slice, padded output).
                %19 = vector.transfer_read %17[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x8x8xf32>, vector<1x8x8xf32>
                %20 = vector.transfer_read %15[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x8x32xf32>, vector<1x8x32xf32>
                %21 = vector.transfer_read %18[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x8x32xf32>, vector<1x8x32xf32>
                // Drop the leading unit dimension of the filter vector -> vector<8x32xf32>.
                %22 = vector.extract %20[0] : vector<1x8x32xf32>
                // acc[d0, d1, d2] += lhs[d0, d1, d3] * rhs[d3, d2], reducing over d3.
                %23 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} %19, %22, %21 : vector<1x8x8xf32>, vector<8x32xf32> into vector<1x8x32xf32>
                // Write the result into the padded tile, cut it back to the
                // valid %4 rows, and insert it into the loop-carried output.
                %24 = vector.transfer_write %23, %18[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x8x32xf32>, tensor<1x1x8x32xf32>
                %25 = tensor.extract_slice %24[0, 0, 0, 0] [1, 1, %4, 32] [1, 1, 1, 1] : tensor<1x1x8x32xf32> to tensor<1x1x?x32xf32>
                %26 = tensor.insert_slice %25 into %arg16[%arg3, %arg5, %arg7, %arg9] [1, 1, %4, 32] [1, 1, 1, 1] : tensor<1x1x?x32xf32> into tensor<8x16x15x64xf32>
                scf.yield %26 : tensor<8x16x15x64xf32>
              }
              scf.yield %13 : tensor<8x16x15x64xf32>
            }
            scf.yield %9 : tensor<8x16x15x64xf32>
          }
          scf.yield %7 : tensor<8x16x15x64xf32>
        }
        scf.yield %6 : tensor<8x16x15x64xf32>
      }
      scf.yield %3 : tensor<8x16x15x64xf32>
    }
    scf.yield %2 : tensor<8x16x15x64xf32>
  }
  return %1 : tensor<8x16x15x64xf32>
}
// IR after Vectorize(fun_name, "", vectorize_paddings=True)
//
// Tiled 2-D convolution on tensors (the function name and the 8x18x17x32 /
// 3x3x32x64 / 8x16x15x64 shapes suggest an NHWC input, HWCF filter, stride-1
// layout -- confirm against the generating pipeline).
//
//   %arg0 : input tensor,  tensor<8x18x17x32xf32>
//   %arg1 : filter tensor, tensor<3x3x32x64xf32>
//   %arg2 : output tensor, tensor<8x16x15x64xf32> (marked linalg.inplaceable)
//
// In this variant there are no linalg.pad_tensor ops: partial tiles along
// dim 2 are read directly from the dynamically sized slices with
// vector.transfer_read ops whose in_bounds is false on that dimension and
// whose padding value is %cst (0.0). The output tile is read once per
// (%arg3, %arg5, %arg7, %arg9) tile, carried through the three reduction
// loops as a vector<1x8x32xf32> iter_arg, and written back once afterwards.
func @conv_2d_nhwc_hwcf_main(%arg0: tensor<8x18x17x32xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg1: tensor<3x3x32x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg2: tensor<8x16x15x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = true}) -> tensor<8x16x15x64xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
  %c0 = arith.constant 0 : index
  %c3 = arith.constant 3 : index
  %c64 = arith.constant 64 : index
  %c15 = arith.constant 15 : index
  %c16 = arith.constant 16 : index
  %c32 = arith.constant 32 : index
  %c8 = arith.constant 8 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant dense<0.000000e+00> : vector<8x16x15x64xf32>
  // Zero-fill the whole output tensor before accumulating into it.
  %0 = vector.transfer_write %cst_0, %arg2[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<8x16x15x64xf32>, tensor<8x16x15x64xf32>
  // %arg3: output dim 0, one element per iteration.
  %1 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %0) -> (tensor<8x16x15x64xf32>) {
    // %arg5: output dim 1, one element per iteration.
    %2 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %arg4) -> (tensor<8x16x15x64xf32>) {
      // %arg7: output dim 2 in tiles of 8; 15 is not a multiple of 8, so the
      // last tile is partial.
      %3 = scf.for %arg7 = %c0 to %c15 step %c8 iter_args(%arg8 = %arg6) -> (tensor<8x16x15x64xf32>) {
        // %4 = valid extent of this output tile along dim 2: min(8, 15 - %arg7).
        %4 = affine.min affine_map<(d0) -> (8, -d0 + 15)>(%arg7)
        // %arg9: output dim 3 in tiles of 32.
        %5 = scf.for %arg9 = %c0 to %c64 step %c32 iter_args(%arg10 = %arg8) -> (tensor<8x16x15x64xf32>) {
          // Slice the output tile and load it once as the initial accumulator.
          // Dim 2 of the slice is dynamic, so the read is marked out-of-bounds
          // there and uses %cst as the padding value.
          %6 = tensor.extract_slice %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, %4, 32] [1, 1, 1, 1] : tensor<8x16x15x64xf32> to tensor<1x1x?x32xf32>
          %7 = vector.transfer_read %6[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, false, true]} : tensor<1x1x?x32xf32>, vector<1x8x32xf32>
          // %arg11: filter dim 0 (extent 3). The reduction loops carry the
          // accumulator vector instead of the full output tensor.
          %8 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %7) -> (vector<1x8x32xf32>) {
            // %11 = input dim-1 index: %arg5 + %arg11.
            %11 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg11)
            // %arg13: filter dim 1 (extent 3).
            %12 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (vector<1x8x32xf32>) {
              // %13 = input dim-2 start offset: %arg7 + %arg13.
              %13 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg7, %arg13)
              // %14 = valid extent of the input tile along dim 2:
              //       min(8, 17 - %arg7 - %arg13).
              %14 = affine.min affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>(%arg7, %arg13)
              // %arg15: reduction over input dim 3 / filter dim 2 in tiles of 8.
              %15 = scf.for %arg15 = %c0 to %c32 step %c8 iter_args(%arg16 = %arg14) -> (vector<1x8x32xf32>) {
                // Slice the input tile (dynamic along dim 2) and the filter tile (static).
                %16 = tensor.extract_slice %arg0[%arg3, %11, %13, %arg15] [1, 1, %14, 8] [1, 1, 1, 1] : tensor<8x18x17x32xf32> to tensor<1x1x?x8xf32>
                %17 = tensor.extract_slice %arg1[%arg11, %arg13, %arg15, %arg9] [1, 1, 8, 32] [1, 1, 1, 1] : tensor<3x3x32x64xf32> to tensor<1x1x8x32xf32>
                // Input read may run past the slice along dim 2; %cst is the padding value.
                %18 = vector.transfer_read %16[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, false, true]} : tensor<1x1x?x8xf32>, vector<1x8x8xf32>
                // Filter read is fully in bounds (statically shaped slice).
                %19 = vector.transfer_read %17[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : tensor<1x1x8x32xf32>, vector<1x8x32xf32>
                // Drop the leading unit dimension of the filter vector -> vector<8x32xf32>.
                %20 = vector.extract %19[0] : vector<1x8x32xf32>
                // acc[d0, d1, d2] += lhs[d0, d1, d3] * rhs[d3, d2], reducing over d3;
                // the accumulator is the loop-carried %arg16.
                %21 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} %18, %20, %arg16 : vector<1x8x8xf32>, vector<8x32xf32> into vector<1x8x32xf32>
                scf.yield %21 : vector<1x8x32xf32>
              }
              scf.yield %15 : vector<1x8x32xf32>
            }
            scf.yield %12 : vector<1x8x32xf32>
          }
          // Write the accumulated vector back into the output slice (dim 2 not
          // in bounds) and insert the slice into the loop-carried output tensor.
          %9 = vector.transfer_write %8, %6[%c0, %c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x8x32xf32>, tensor<1x1x?x32xf32>
          %10 = tensor.insert_slice %9 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, %4, 32] [1, 1, 1, 1] : tensor<1x1x?x32xf32> into tensor<8x16x15x64xf32>
          scf.yield %10 : tensor<8x16x15x64xf32>
        }
        scf.yield %5 : tensor<8x16x15x64xf32>
      }
      scf.yield %3 : tensor<8x16x15x64xf32>
    }
    scf.yield %2 : tensor<8x16x15x64xf32>
  }
  return %1 : tensor<8x16x15x64xf32>
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment