Skip to content

Instantly share code, notes, and snippets.

@gysit
Last active December 10, 2021 15:08
Show Gist options
  • Save gysit/b3440f4cf821e62f4c44ec44c4bd8e26 to your computer and use it in GitHub Desktop.
conv2d lowering with single vectorization pass
// IR before running the vectorization pass
// Tiled and decomposed 2-D convolution (NHWC input x HWCF filter -> NHWC
// output), before vectorization. The loop nest iterates:
//   batch (0..8), output height (0..16), output width (0..15, tile 8),
//   output channels (0..64, tile 32), filter height (0..3), filter width
//   (0..3), input channels (0..32, tile 8).
// The output-width dimension (15) is not divisible by its tile size (8), so
// the boundary tiles are dynamically sized and padded back to static shape.
func @conv_2d_nhwc_hwcf_main(%arg0: tensor<8x18x17x32xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg1: tensor<3x3x32x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg2: tensor<8x16x15x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = true}) -> tensor<8x16x15x64xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%c64 = arith.constant 64 : index
%c15 = arith.constant 15 : index
%c16 = arith.constant 16 : index
%c32 = arith.constant 32 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
// Zero-initialize the output tensor before accumulating the convolution.
%0 = linalg.fill(%cst, %arg2) : f32, tensor<8x16x15x64xf32> -> tensor<8x16x15x64xf32>
%1 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %0) -> (tensor<8x16x15x64xf32>) {
%2 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %arg4) -> (tensor<8x16x15x64xf32>) {
%3 = scf.for %arg7 = %c0 to %c15 step %c8 iter_args(%arg8 = %arg6) -> (tensor<8x16x15x64xf32>) {
// %4 = actual output-width tile size: min(8, 15 - %arg7); the last tile is
// partial (size 7). %5 = padding amount needed to reach the static size 8.
%4 = affine.min affine_map<(d0) -> (8, -d0 + 15)>(%arg7)
%5 = affine.apply affine_map<(d0) -> (-d0 + 8)>(%4)
%6 = scf.for %arg9 = %c0 to %c64 step %c32 iter_args(%arg10 = %arg8) -> (tensor<8x16x15x64xf32>) {
%7 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (tensor<8x16x15x64xf32>) {
// Input row index = output row + filter row (stride 1, no dilation).
%8 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg11)
%9 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (tensor<8x16x15x64xf32>) {
// Input column index and its (possibly partial) in-bounds extent / padding.
%10 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg7, %arg13)
%11 = affine.min affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>(%arg7, %arg13)
%12 = affine.apply affine_map<(d0) -> (-d0 + 8)>(%11)
%13 = scf.for %arg15 = %c0 to %c32 step %c8 iter_args(%arg16 = %arg14) -> (tensor<8x16x15x64xf32>) {
// Dynamically sized input / filter / output tiles for this iteration.
%14 = tensor.extract_slice %arg0[%arg3, %8, %10, %arg15] [1, 1, %11, 8] [1, 1, 1, 1] : tensor<8x18x17x32xf32> to tensor<1x1x?x8xf32>
%15 = tensor.extract_slice %arg1[%arg11, %arg13, %arg15, %arg9] [1, 1, 8, 32] [1, 1, 1, 1] : tensor<3x3x32x64xf32> to tensor<1x1x8x32xf32>
%16 = tensor.extract_slice %arg16[%arg3, %arg5, %arg7, %arg9] [1, 1, %4, 32] [1, 1, 1, 1] : tensor<8x16x15x64xf32> to tensor<1x1x?x32xf32>
// Zero-pad the dynamically sized input and output tiles up to the static
// tile shape (dim 2 padded to 8) so downstream ops see static shapes.
%17 = linalg.pad_tensor %14 low[%c0, %c0, %c0, %c0] high[%c0, %c0, %12, %c0] {
^bb0(%arg17: index, %arg18: index, %arg19: index, %arg20: index): // no predecessors
linalg.yield %cst : f32
} : tensor<1x1x?x8xf32> to tensor<1x1x8x8xf32>
%18 = linalg.pad_tensor %16 low[%c0, %c0, %c0, %c0] high[%c0, %c0, %5, %c0] {
^bb0(%arg17: index, %arg18: index, %arg19: index, %arg20: index): // no predecessors
linalg.yield %cst : f32
} : tensor<1x1x?x32xf32> to tensor<1x1x8x32xf32>
// Rank-reducing slices drop the unit filter-height dimension: the 2-D
// convolution is computed one filter row/column at a time as a 1-D
// NWC/WCF convolution on the padded static tiles.
%19 = tensor.extract_slice %17[0, 0, 0, 0] [1, 1, 8, 8] [1, 1, 1, 1] : tensor<1x1x8x8xf32> to tensor<1x8x8xf32>
%20 = tensor.extract_slice %15[0, 0, 0, 0] [1, 1, 8, 32] [1, 1, 1, 1] : tensor<1x1x8x32xf32> to tensor<1x8x32xf32>
%21 = tensor.extract_slice %18[0, 0, 0, 0] [1, 1, 8, 32] [1, 1, 1, 1] : tensor<1x1x8x32xf32> to tensor<1x8x32xf32>
%22 = linalg.conv_1d_nwc_wcf {dilations = dense<1> : vector<1xi64>, strides = dense<1> : vector<1xi64>} ins(%19, %20 : tensor<1x8x8xf32>, tensor<1x8x32xf32>) outs(%21 : tensor<1x8x32xf32>) -> tensor<1x8x32xf32>
// Re-insert the result, strip the padding back to the dynamic tile size
// (%4), and write the tile into the full output tensor.
%23 = tensor.insert_slice %22 into %18[0, 0, 0, 0] [1, 1, 8, 32] [1, 1, 1, 1] : tensor<1x8x32xf32> into tensor<1x1x8x32xf32>
%24 = tensor.extract_slice %23[0, 0, 0, 0] [1, 1, %4, 32] [1, 1, 1, 1] : tensor<1x1x8x32xf32> to tensor<1x1x?x32xf32>
%25 = tensor.insert_slice %24 into %arg16[%arg3, %arg5, %arg7, %arg9] [1, 1, %4, 32] [1, 1, 1, 1] : tensor<1x1x?x32xf32> into tensor<8x16x15x64xf32>
scf.yield %25 : tensor<8x16x15x64xf32>
}
scf.yield %13 : tensor<8x16x15x64xf32>
}
scf.yield %9 : tensor<8x16x15x64xf32>
}
scf.yield %7 : tensor<8x16x15x64xf32>
}
scf.yield %6 : tensor<8x16x15x64xf32>
}
scf.yield %3 : tensor<8x16x15x64xf32>
}
scf.yield %2 : tensor<8x16x15x64xf32>
}
return %1 : tensor<8x16x15x64xf32>
}
// IR printed after Vectorize(fun_name, "", vectorize_paddings=True)
// Same function after the single vectorization pass (with
// vectorize_paddings=True). Compared to the pre-vectorization IR:
//   - linalg.fill became a single full-size vector.transfer_write of zeros;
//   - each linalg.pad_tensor became a vector.transfer_read with a padding
//     value and a partially out-of-bounds dimension, written into a
//     statically shaped staging tensor;
//   - linalg.conv_1d_nwc_wcf became a vector.contract;
//   - the two staging tensors (%1, %2) are created once, outside the loops.
func @conv_2d_nhwc_hwcf_main(%arg0: tensor<8x18x17x32xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg1: tensor<3x3x32x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = false}, %arg2: tensor<8x16x15x64xf32> {linalg.buffer_layout = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, linalg.inplaceable = true}) -> tensor<8x16x15x64xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%cst = arith.constant dense<0.000000e+00> : vector<8x16x15x64xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c15 = arith.constant 15 : index
%c64 = arith.constant 64 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
// Vectorized zero-fill of the whole output tensor (replaces linalg.fill).
%0 = vector.transfer_write %cst, %arg2[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<8x16x15x64xf32>, tensor<8x16x15x64xf32>
// Static staging tensors for the padded input/output tiles, hoisted out of
// the loop nest (replace the per-iteration linalg.pad_tensor results).
%1 = linalg.init_tensor [1, 1, 8, 8] : tensor<1x1x8x8xf32>
%2 = linalg.init_tensor [1, 1, 8, 32] : tensor<1x1x8x32xf32>
%3 = scf.for %arg3 = %c0 to %c8 step %c1 iter_args(%arg4 = %0) -> (tensor<8x16x15x64xf32>) {
%4 = scf.for %arg5 = %c0 to %c16 step %c1 iter_args(%arg6 = %arg4) -> (tensor<8x16x15x64xf32>) {
%5 = scf.for %arg7 = %c0 to %c15 step %c8 iter_args(%arg8 = %arg6) -> (tensor<8x16x15x64xf32>) {
// Dynamic output-width tile size: min(8, 15 - %arg7).
%6 = affine.min affine_map<(d0) -> (8, -d0 + 15)>(%arg7)
%7 = scf.for %arg9 = %c0 to %c64 step %c32 iter_args(%arg10 = %arg8) -> (tensor<8x16x15x64xf32>) {
%8 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (tensor<8x16x15x64xf32>) {
%9 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg11)
%10 = scf.for %arg13 = %c0 to %c3 step %c1 iter_args(%arg14 = %arg12) -> (tensor<8x16x15x64xf32>) {
%11 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg7, %arg13)
%12 = affine.min affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>(%arg7, %arg13)
%13 = scf.for %arg15 = %c0 to %c32 step %c8 iter_args(%arg16 = %arg14) -> (tensor<8x16x15x64xf32>) {
%14 = tensor.extract_slice %arg0[%arg3, %9, %11, %arg15] [1, 1, %12, 8] [1, 1, 1, 1] : tensor<8x18x17x32xf32> to tensor<1x1x?x8xf32>
%15 = tensor.extract_slice %arg1[%arg11, %arg13, %arg15, %arg9] [1, 1, 8, 32] [1, 1, 1, 1] : tensor<3x3x32x64xf32> to tensor<1x1x8x32xf32>
%16 = tensor.extract_slice %arg16[%arg3, %arg5, %arg7, %arg9] [1, 1, %6, 32] [1, 1, 1, 1] : tensor<8x16x15x64xf32> to tensor<1x1x?x32xf32>
// Vectorized padding: dim 2 is marked out-of-bounds (in_bounds = false),
// so reads past the dynamic extent yield the padding value %cst_0. The
// result is written into the hoisted static staging tensors %1 / %2.
%17 = vector.transfer_read %14[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, false, true]} : tensor<1x1x?x8xf32>, vector<1x1x8x8xf32>
%18 = vector.transfer_write %17, %1[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x8x8xf32>, tensor<1x1x8x8xf32>
%19 = vector.transfer_read %16[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, false, true]} : tensor<1x1x?x32xf32>, vector<1x1x8x32xf32>
%20 = vector.transfer_write %19, %2[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x8x32xf32>, tensor<1x1x8x32xf32>
// Re-read the static tiles as rank-3 vectors (unit leading dim dropped
// from the filter/accumulator views used by the contraction).
%21 = vector.transfer_read %18[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x8x8xf32>, vector<1x8x8xf32>
%22 = vector.transfer_read %15[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x8x32xf32>, vector<1x8x32xf32>
%23 = vector.transfer_read %20[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x8x32xf32>, vector<1x8x32xf32>
%24 = vector.extract %22[0] : vector<1x8x32xf32>
// The 1-D convolution is now a contraction reducing over d3 (the input
// channels): (n, w, c) x (c, f) -> (n, w, f), accumulating into %23.
%25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} %21, %24, %23 : vector<1x8x8xf32>, vector<8x32xf32> into vector<1x8x32xf32>
// Strip padding back to the dynamic tile size and store into the output.
%26 = vector.transfer_write %25, %20[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x8x32xf32>, tensor<1x1x8x32xf32>
%27 = tensor.extract_slice %26[0, 0, 0, 0] [1, 1, %6, 32] [1, 1, 1, 1] : tensor<1x1x8x32xf32> to tensor<1x1x?x32xf32>
%28 = tensor.insert_slice %27 into %arg16[%arg3, %arg5, %arg7, %arg9] [1, 1, %6, 32] [1, 1, 1, 1] : tensor<1x1x?x32xf32> into tensor<8x16x15x64xf32>
scf.yield %28 : tensor<8x16x15x64xf32>
}
scf.yield %13 : tensor<8x16x15x64xf32>
}
scf.yield %10 : tensor<8x16x15x64xf32>
}
scf.yield %8 : tensor<8x16x15x64xf32>
}
scf.yield %7 : tensor<8x16x15x64xf32>
}
scf.yield %5 : tensor<8x16x15x64xf32>
}
scf.yield %4 : tensor<8x16x15x64xf32>
}
return %3 : tensor<8x16x15x64xf32>
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment