// fusion lowering: matmul + bias add (gist by @gysit, April 11, 2022).
// The four listings below show the same function at successive stages of a
// Linalg-on-tensors lowering pipeline: initial IR; after fusion, tiling,
// and padding; after vectorization; and after bufferization.
// 1) Initial IR.
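// The function computes C = A * B + bias, with A: 601x513, B: 513x321,
// bias: 321, all f32; the result is written into the inplaceable output
// tensor %arg3. For reference, a C-style sketch of the same computation
// (illustration only, not part of the original gist):
//
//   for (int i = 0; i < 601; ++i)
//     for (int j = 0; j < 321; ++j) {
//       float acc = 0.0f;                    // linalg.fill
//       for (int k = 0; k < 513; ++k)
//         acc += A[i][k] * B[k][j];          // linalg.matmul
//       C[i][j] = acc + bias[j];             // linalg.generic (bias add)
//     }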
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%cst = arith.constant 0.000000e+00 : f32
%0 = linalg.fill ins(%cst : f32) outs(%arg3 : tensor<601x321xf32>) -> tensor<601x321xf32>
%1 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<601x513xf32>, tensor<513x321xf32>) outs(%0 : tensor<601x321xf32>) -> tensor<601x321xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<321xf32>) outs(%1 : tensor<601x321xf32>) {
^bb0(%arg4: f32, %arg5: f32):
%3 = arith.addf %arg5, %arg4 : f32
linalg.yield %3 : f32
} -> tensor<601x321xf32>
return %2 : tensor<601x321xf32>
}
// 2) After fusion, tiling, and padding.
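// The matmul is tiled 12x32x16 (M x N x K) and every operand tile is
// zero-padded to full size, so the inner loops always see static 12x16,
// 16x32, and 12x32 shapes despite the odd problem sizes. The padded tiles
// are hoisted out of the compute loops into packed tensors:
//   bias: tensor<11x32>        11 = ceil(321/32) N tiles
//   RHS:  tensor<11x33x16x32>  33 = ceil(513/16) K tiles
//   LHS:  tensor<33x12x16>     repacked once per M tile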
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c601 = arith.constant 601 : index
%c321 = arith.constant 321 : index
%c0 = arith.constant 0 : index
%c513 = arith.constant 513 : index
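// Pack the bias: one row of 32 per N tile; the last tile (321 = 10*32 + 1)
// is padded with zeros to full width.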
%0 = linalg.init_tensor [11, 32] : tensor<11x32xf32>
%1 = tensor.cast %0 : tensor<11x32xf32> to tensor<?x32xf32>
%2 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %1) -> (tensor<?x32xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%11 = tensor.extract_slice %arg2[%arg4] [%10] [1] : tensor<321xf32> to tensor<?xf32>
%12 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%10)
%13 = tensor.pad %11 low[%c0] high[%12] {
^bb0(%arg6: index):
tensor.yield %cst : f32
} : tensor<?xf32> to tensor<32xf32>
%14 = tensor.insert_slice %13 into %arg5[%9, 0] [1, 32] [1, 1] : tensor<32xf32> into tensor<?x32xf32>
scf.yield %14 : tensor<?x32xf32>
}
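// Pack the RHS into 11x33 padded 16x32 tiles. The `nofold` attribute on
// tensor.pad keeps the copy even when a tile is already full-sized, so
// every tile consumed below is read from the packed layout.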
%3 = linalg.init_tensor [11, 33, 16, 32] : tensor<11x33x16x32xf32>
%4 = tensor.cast %3 : tensor<11x33x16x32xf32> to tensor<?x?x16x32xf32>
%5 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %4) -> (tensor<?x?x16x32xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%11 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%10)
%12 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %arg5) -> (tensor<?x?x16x32xf32>) {
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%14 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6)
%15 = tensor.extract_slice %arg1[%arg6, %arg4] [%14, %10] [1, 1] : tensor<513x321xf32> to tensor<?x?xf32>
%16 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%14)
%17 = tensor.pad %15 nofold low[%c0, %c0] high[%16, %11] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<16x32xf32>
%18 = tensor.insert_slice %17 into %arg7[%9, %13, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor<16x32xf32> into tensor<?x?x16x32xf32>
scf.yield %18 : tensor<?x?x16x32xf32>
}
scf.yield %12 : tensor<?x?x16x32xf32>
}
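// Buffer for the packed LHS: 33 padded 12x16 tiles (one per K tile),
// refilled for every M tile inside the outer loop below.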
%6 = linalg.init_tensor [33, 12, 16] : tensor<33x12x16xf32>
%7 = tensor.cast %6 : tensor<33x12x16xf32> to tensor<?x12x16xf32>
%8 = scf.for %arg4 = %c0 to %c601 step %c12 iter_args(%arg5 = %arg3) -> (tensor<601x321xf32>) {
%9 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4)
%10 = affine.apply affine_map<(d0) -> (-d0 + 12)>(%9)
%11 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %7) -> (tensor<?x12x16xf32>) {
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%14 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6)
%15 = tensor.extract_slice %arg0[%arg4, %arg6] [%9, %14] [1, 1] : tensor<601x513xf32> to tensor<?x?xf32>
%16 = affine.apply affine_map<(d0) -> (-d0 + 16)>(%14)
%17 = tensor.pad %15 nofold low[%c0, %c0] high[%10, %16] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<12x16xf32>
%18 = tensor.insert_slice %17 into %arg7[%13, 0, 0] [1, 12, 16] [1, 1, 1] : tensor<12x16xf32> into tensor<?x12x16xf32>
scf.yield %18 : tensor<?x12x16xf32>
}
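// Compute over the N tiles: fill a padded 12x32 accumulator, reduce over K
// using the packed LHS and RHS tiles, add the packed bias, then write only
// the valid %9 x %13 region back into the 601x321 result.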
%12 = scf.for %arg6 = %c0 to %c321 step %c32 iter_args(%arg7 = %arg5) -> (tensor<601x321xf32>) {
%13 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg6)
%14 = tensor.extract_slice %arg7[%arg4, %arg6] [%9, %13] [1, 1] : tensor<601x321xf32> to tensor<?x?xf32>
%15 = affine.apply affine_map<(d0) -> (-d0 + 32)>(%13)
%16 = tensor.pad %14 low[%c0, %c0] high[%10, %15] {
^bb0(%arg8: index, %arg9: index):
tensor.yield %cst : f32
} : tensor<?x?xf32> to tensor<12x32xf32>
%17 = linalg.fill {iree_linalg_transform.matched} ins(%cst : f32) outs(%16 : tensor<12x32xf32>) -> tensor<12x32xf32>
%18 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg6)
%19 = scf.for %arg8 = %c0 to %c513 step %c16 iter_args(%arg9 = %17) -> (tensor<12x32xf32>) {
%24 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg8)
%25 = tensor.extract_slice %11[%24, 0, 0] [1, 12, 16] [1, 1, 1] : tensor<?x12x16xf32> to tensor<12x16xf32>
%26 = tensor.extract_slice %5[%18, %24, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : tensor<?x?x16x32xf32> to tensor<16x32xf32>
%27 = linalg.matmul {cast = #linalg.type_fn<cast_signed>, iree_linalg_transform.matched} ins(%25, %26 : tensor<12x16xf32>, tensor<16x32xf32>) outs(%arg9 : tensor<12x32xf32>) -> tensor<12x32xf32>
scf.yield %27 : tensor<12x32xf32>
}
%20 = tensor.extract_slice %2[%18, 0] [1, 32] [1, 1] : tensor<?x32xf32> to tensor<32xf32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%20 : tensor<32xf32>) outs(%19 : tensor<12x32xf32>) attrs = {iree_linalg_transform.matched} {
^bb0(%arg8: f32, %arg9: f32):
%24 = arith.addf %arg9, %arg8 : f32
linalg.yield %24 : f32
} -> tensor<12x32xf32>
%22 = tensor.extract_slice %21[0, 0] [%9, %13] [1, 1] : tensor<12x32xf32> to tensor<?x?xf32>
%23 = tensor.insert_slice %22 into %arg7[%arg4, %arg6] [%9, %13] [1, 1] : tensor<?x?xf32> into tensor<601x321xf32>
scf.yield %23 : tensor<601x321xf32>
}
scf.yield %12 : tensor<601x321xf32>
}
return %8 : tensor<601x321xf32>
}
// 3) After vectorization.
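// Vectorization replaces the pad/fill/matmul/generic ops with vector
// transfers and arithmetic: the pad + insert_slice packing becomes a
// vector.transfer_read (out-of-bounds elements filled with %cst) followed
// by a vector.transfer_write, the matmul becomes a vector.contract on
// vector<12x16> x vector<16x32> operands, and the bias add becomes a
// vector.broadcast + arith.addf.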
func @matmul_bias_add(%arg0: tensor<601x513xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<513x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<321xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = false}, %arg3: tensor<601x321xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<601x321xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c601 = arith.constant 601 : index
%c321 = arith.constant 321 : index
%c0 = arith.constant 0 : index
%c513 = arith.constant 513 : index
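// Zero vector seeding the in-register accumulator; it replaces the padded
// linalg.fill of the output tile from step 2.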
%cst_0 = arith.constant dense<0.000000e+00> : vector<12x32xf32>
%0 = linalg.init_tensor [11, 32] : tensor<11x32xf32>
%1 = tensor.cast %0 : tensor<11x32xf32> to tensor<?x32xf32>
%2 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %1) -> (tensor<?x32xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%11 = tensor.extract_slice %arg2[%arg4] [%10] [1] : tensor<321xf32> to tensor<?xf32>
%12 = vector.transfer_read %11[%c0], %cst : tensor<?xf32>, vector<32xf32>
%13 = vector.transfer_write %12, %arg5[%9, %c0] {in_bounds = [true]} : vector<32xf32>, tensor<?x32xf32>
scf.yield %13 : tensor<?x32xf32>
}
%3 = linalg.init_tensor [11, 33, 16, 32] : tensor<11x33x16x32xf32>
%4 = tensor.cast %3 : tensor<11x33x16x32xf32> to tensor<?x?x16x32xf32>
%5 = scf.for %arg4 = %c0 to %c321 step %c32 iter_args(%arg5 = %4) -> (tensor<?x?x16x32xf32>) {
%9 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%10 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%11 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %arg5) -> (tensor<?x?x16x32xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%13 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6)
%14 = tensor.extract_slice %arg1[%arg6, %arg4] [%13, %10] [1, 1] : tensor<513x321xf32> to tensor<?x?xf32>
%15 = vector.transfer_read %14[%c0, %c0], %cst : tensor<?x?xf32>, vector<16x32xf32>
%16 = vector.transfer_write %15, %arg7[%9, %12, %c0, %c0] {in_bounds = [true, true]} : vector<16x32xf32>, tensor<?x?x16x32xf32>
scf.yield %16 : tensor<?x?x16x32xf32>
}
scf.yield %11 : tensor<?x?x16x32xf32>
}
%6 = linalg.init_tensor [33, 12, 16] : tensor<33x12x16xf32>
%7 = tensor.cast %6 : tensor<33x12x16xf32> to tensor<?x12x16xf32>
%8 = scf.for %arg4 = %c0 to %c601 step %c12 iter_args(%arg5 = %arg3) -> (tensor<601x321xf32>) {
%9 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4)
%10 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %7) -> (tensor<?x12x16xf32>) {
%12 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%13 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg6)
%14 = tensor.extract_slice %arg0[%arg4, %arg6] [%9, %13] [1, 1] : tensor<601x513xf32> to tensor<?x?xf32>
%15 = vector.transfer_read %14[%c0, %c0], %cst : tensor<?x?xf32>, vector<12x16xf32>
%16 = vector.transfer_write %15, %arg7[%12, %c0, %c0] {in_bounds = [true, true]} : vector<12x16xf32>, tensor<?x12x16xf32>
scf.yield %16 : tensor<?x12x16xf32>
}
%11 = scf.for %arg6 = %c0 to %c321 step %c32 iter_args(%arg7 = %arg5) -> (tensor<601x321xf32>) {
%12 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg6)
%13 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg6)
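// The K reduction now runs entirely in registers: the vector<12x32xf32>
// accumulator is threaded through iter_args and updated by one
// vector.contract per 16-wide K tile.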
%14 = scf.for %arg8 = %c0 to %c513 step %c16 iter_args(%arg9 = %cst_0) -> (vector<12x32xf32>) {
%21 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg8)
%22 = vector.transfer_read %10[%21, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x12x16xf32>, vector<12x16xf32>
%23 = vector.transfer_read %5[%13, %21, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x?x16x32xf32>, vector<16x32xf32>
%24 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %22, %23, %arg9 : vector<12x16xf32>, vector<16x32xf32> into vector<12x32xf32>
scf.yield %24 : vector<12x32xf32>
}
%15 = vector.transfer_read %2[%13, %c0], %cst {in_bounds = [true]} : tensor<?x32xf32>, vector<32xf32>
%16 = vector.broadcast %15 : vector<32xf32> to vector<12x32xf32>
%17 = arith.addf %14, %16 : vector<12x32xf32>
%18 = tensor.extract_slice %arg7[%arg4, %arg6] [%9, %12] [1, 1] : tensor<601x321xf32> to tensor<?x?xf32>
%19 = vector.transfer_write %17, %18[%c0, %c0] : vector<12x32xf32>, tensor<?x?xf32>
%20 = tensor.insert_slice %19 into %arg7[%arg4, %arg6] [%9, %12] [1, 1] : tensor<?x?xf32> into tensor<601x321xf32>
scf.yield %20 : tensor<601x321xf32>
}
scf.yield %11 : tensor<601x321xf32>
}
return %8 : tensor<601x321xf32>
}
// 4) After bufferization.
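// Bufferization rewrites the function on memrefs: the tensor arguments
// become memref arguments, the result is written in place into %arg3
// (which was marked inplaceable), and extract/insert_slice turn into
// memref.subview plus in-place vector transfers.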
func @matmul_bias_add(%arg0: memref<601x513xf32>, %arg1: memref<513x321xf32>, %arg2: memref<321xf32>, %arg3: memref<601x321xf32>) attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c601 = arith.constant 601 : index
%c321 = arith.constant 321 : index
%c0 = arith.constant 0 : index
%c513 = arith.constant 513 : index
%cst_0 = arith.constant dense<0.000000e+00> : vector<12x32xf32>
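// The three packing tensors become 128-byte-aligned allocations with their
// static upper-bound shapes, and are freed at the end of the function.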
%0 = memref.alloc() {alignment = 128 : i64} : memref<11x32xf32>
scf.for %arg4 = %c0 to %c321 step %c32 {
%3 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
%5 = memref.subview %arg2[%arg4] [%4] [1] : memref<321xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
%6 = vector.transfer_read %5[%c0], %cst : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, vector<32xf32>
vector.transfer_write %6, %0[%3, %c0] {in_bounds = [true]} : vector<32xf32>, memref<11x32xf32>
}
%1 = memref.alloc() {alignment = 128 : i64} : memref<11x33x16x32xf32>
scf.for %arg4 = %c0 to %c321 step %c32 {
%3 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg4)
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg4)
scf.for %arg5 = %c0 to %c513 step %c16 {
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg5)
%6 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg5)
%7 = memref.subview %arg1[%arg5, %arg4] [%6, %4] [1, 1] : memref<513x321xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>
%8 = vector.transfer_read %7[%c0, %c0], %cst : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>, vector<16x32xf32>
vector.transfer_write %8, %1[%3, %5, %c0, %c0] {in_bounds = [true, true]} : vector<16x32xf32>, memref<11x33x16x32xf32>
}
}
%2 = memref.alloc() {alignment = 128 : i64} : memref<33x12x16xf32>
scf.for %arg4 = %c0 to %c601 step %c12 {
%3 = affine.min affine_map<(d0) -> (-d0 + 601, 12)>(%arg4)
scf.for %arg5 = %c0 to %c513 step %c16 {
%4 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg5)
%5 = affine.min affine_map<(d0) -> (-d0 + 513, 16)>(%arg5)
%6 = memref.subview %arg0[%arg4, %arg5] [%3, %5] [1, 1] : memref<601x513xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 513 + s0 + d1)>>
%7 = vector.transfer_read %6[%c0, %c0], %cst : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 513 + s0 + d1)>>, vector<12x16xf32>
vector.transfer_write %7, %2[%4, %c0, %c0] {in_bounds = [true, true]} : vector<12x16xf32>, memref<33x12x16xf32>
}
scf.for %arg5 = %c0 to %c321 step %c32 {
%4 = affine.min affine_map<(d0) -> (-d0 + 321, 32)>(%arg5)
%5 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg5)
%6 = scf.for %arg6 = %c0 to %c513 step %c16 iter_args(%arg7 = %cst_0) -> (vector<12x32xf32>) {
%11 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg6)
%12 = vector.transfer_read %2[%11, %c0, %c0], %cst {in_bounds = [true, true]} : memref<33x12x16xf32>, vector<12x16xf32>
%13 = vector.transfer_read %1[%5, %11, %c0, %c0], %cst {in_bounds = [true, true]} : memref<11x33x16x32xf32>, vector<16x32xf32>
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %12, %13, %arg7 : vector<12x16xf32>, vector<16x32xf32> into vector<12x32xf32>
scf.yield %14 : vector<12x32xf32>
}
%7 = vector.transfer_read %0[%5, %c0], %cst {in_bounds = [true]} : memref<11x32xf32>, vector<32xf32>
%8 = vector.broadcast %7 : vector<32xf32> to vector<12x32xf32>
%9 = arith.addf %6, %8 : vector<12x32xf32>
%10 = memref.subview %arg3[%arg4, %arg5] [%3, %4] [1, 1] : memref<601x321xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>
vector.transfer_write %9, %10[%c0, %c0] : vector<12x32xf32>, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 321 + s0 + d1)>>
}
}
memref.dealloc %0 : memref<11x32xf32>
memref.dealloc %1 : memref<11x33x16x32xf32>
memref.dealloc %2 : memref<33x12x16xf32>
return
}