antiagainst/vector-hoisting.mlir

## vector-hoisting.mlir
// Dispatch function that reads tiles from three read-only bindings, accumulates
// vector.contract results into a zero-initialised tile, subtracts the matching
// tile of binding 2, and stores the result to the write-only binding 3.
// NOTE(review): the symbol name suggests a 2-D NHWC/HWCF convolution with a
// 3x3x3 filter producing 1x112x112x16 — confirm against the originating model.
func.func @conv_pad_dispatch_1_conv_2d_nhwc_hwcf_1x112x112x16x3x3x3() {
  // All-zero vector used to initialise every 1x2x2x4 accumulator tile.
  %cst = arith.constant dense<0.000000e+00> : vector<1x2x2x4xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c112 = arith.constant 112 : index
  %c16 = arith.constant 16 : index
  // Scalar zero passed as the padding operand of every vector.transfer_read below.
  %cst_0 = arith.constant 0.000000e+00 : f32
  // Bindings 0-2 are read-only (1x225x225x3, 3x3x3x16, 1x112x112x16); binding 3 is the write-only 1x112x112x16 result.
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x16xf32>
  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x16xf32>
  %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  // Workgroup-level loops: each starts at id * tile and advances by count * tile.
  // Tile sizes are 4 (z, bound 112), 4 (y, bound 112) and 16 (x, bound 16).
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_z]
  scf.for %arg0 = %4 to %c112 step %5 {
    %6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
    %7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
    scf.for %arg1 = %6 to %c112 step %7 {
      %8 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
      %9 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_x]
      scf.for %arg2 = %8 to %c16 step %9 {
        // 1x4x4x16 tile of the write-only output; it seeds the iter_args chain and is overwritten slice by slice below.
        %10 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x16xf32> -> tensor<1x4x4x16xf32>
        // Offsets into binding 0 are twice the output offsets; a 1x9x9x3 window is loaded for each 4x4 output tile.
        %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %13 = flow.dispatch.tensor.load %0, offsets = [0, %11, %12, 0], sizes = [1, 9, 9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x9x9x3xf32>
        %14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
        // Tile of binding 2 at the same offsets as the output tile; it is the subtrahend of the arith.subf below.
        %15 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x16xf32> -> tensor<1x4x4x16xf32>
        // Three loops tagged iree.spirv.distribute_dim = 2 / 1 / 0 split the 1x4x4x16 tile into 1x2x2x4 sub-tiles,
        // threading the partially updated tile through iter_args.
        %16 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %10) -> (tensor<1x4x4x16xf32>) {
          %17 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<1x4x4x16xf32>) {
            %18 = scf.for %arg7 = %c0 to %c16 step %c4 iter_args(%arg8 = %arg6) -> (tensor<1x4x4x16xf32>) {
              %19 = tensor.extract_slice %15[0, %arg3, %arg5, %arg7] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x2x2x4xf32>
              %20 = tensor.extract_slice %arg8[0, %arg3, %arg5, %arg7] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x2x2x4xf32>
              // Zero-fill the sub-tile; %21 is the initial accumulator for the loops below.
              %21 = vector.transfer_write %cst, %20[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf32>, tensor<1x2x2x4xf32>
              %22 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
              %23 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
              // 1x5x5x3 window of the binding-0 tile and 3x3x3x4 slice of the binding-1 tile used for this sub-tile.
              %24 = tensor.extract_slice %13[0, %22, %23, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x9x9x3xf32> to tensor<1x5x5x3xf32>
              %25 = tensor.extract_slice %14[0, 0, 0, %arg7] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x4xf32>
              // %arg9 and %arg11 each run 0..3 and index the first two dimensions of the 3x3x3x4 slice.
              %26 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %21) -> (tensor<1x2x2x4xf32>) {
                %32 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x2x2x4xf32>) {
                  %33 = tensor.extract_slice %24[0, %arg9, %arg11, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x3x3x3xf32>
                  %34 = tensor.extract_slice %25[%arg9, %arg11, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
                  // Loop over the two rows of the accumulator; carries the spirv.unroll attribute.
                  %35 = scf.for %arg13 = %c0 to %c2 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x2x2x4xf32>) {
                    // Row %arg13 of the accumulator pairs with row 2 * %arg13 of the 1x3x3x3 window.
                    %36 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg13)
                    %37 = tensor.extract_slice %33[0, %36, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32>
                    %38 = tensor.extract_slice %arg14[0, %arg13, 0, 0] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf32> to tensor<1x1x2x4xf32>
                    // Rank-reducing slices: drop one unit dimension so the transfers below operate on 3-D tensors.
                    %39 = tensor.extract_slice %37[0, 0, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x3x3xf32>
                    %40 = tensor.extract_slice %34[0, 0, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<1x1x3x4xf32> to tensor<1x3x4xf32>
                    %41 = tensor.extract_slice %38[0, 0, 0, 0] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> to tensor<1x2x4xf32>
                    %42 = vector.transfer_read %39[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x3x3xf32>, vector<1x3x3xf32>
                    %43 = vector.transfer_read %40[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x3x4xf32>, vector<1x3x4xf32>
                    // Accumulator row is read here and written back at the end of the iteration.
                    // NOTE(review): presumably this read/write pair is the hoisting candidate the file name refers to — confirm.
                    %44 = vector.transfer_read %41[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x2x4xf32>, vector<1x2x4xf32>
                    // Positions 0 and 2 along the middle dimension of %42 feed accumulator positions 0 and 1 respectively.
                    %45 = vector.extract_strided_slice %42 {offsets = [0, 0, 0], sizes = [1, 1, 3], strides = [1, 1, 1]} : vector<1x3x3xf32> to vector<1x1x3xf32>
                    %46 = vector.extract_strided_slice %42 {offsets = [0, 2, 0], sizes = [1, 1, 3], strides = [1, 1, 1]} : vector<1x3x3xf32> to vector<1x1x3xf32>
                    %47 = vector.extract %43[0] : vector<1x3x4xf32>
                    %48 = vector.extract_strided_slice %44 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x2x4xf32> to vector<1x1x4xf32>
                    %49 = vector.extract_strided_slice %44 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x2x4xf32> to vector<1x1x4xf32>
                    // Each contract reduces over d3 (size 3) with kind add: acc[d0, d1, d2] += lhs[d0, d1, d3] * rhs[d3, d2].
                    %50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %47, %48 : vector<1x1x3xf32>, vector<3x4xf32> into vector<1x1x4xf32>
                    %51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %47, %49 : vector<1x1x3xf32>, vector<3x4xf32> into vector<1x1x4xf32>
                    // Reassemble the two updated 1x1x4 pieces into the 1x2x4 accumulator vector.
                    %52 = vector.insert_strided_slice %50, %44 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<1x2x4xf32>
                    %53 = vector.insert_strided_slice %51, %52 {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<1x2x4xf32>
                    %54 = vector.transfer_write %53, %41[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x4xf32>, tensor<1x2x4xf32>
                    // Undo the two extract_slice steps so the loop yields a full 1x2x2x4 tensor again.
                    %55 = tensor.insert_slice %54 into %38[0, 0, 0, 0] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x2x4xf32> into tensor<1x1x2x4xf32>
                    %56 = tensor.insert_slice %55 into %arg14[0, %arg13, 0, 0] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x2x2x4xf32>
                    scf.yield %56 : tensor<1x2x2x4xf32>
                  } {spirv.unroll}
                  scf.yield %35 : tensor<1x2x2x4xf32>
                }
                scf.yield %32 : tensor<1x2x2x4xf32>
              }
              // Elementwise epilogue: (accumulated sub-tile %26) minus (binding-2 sub-tile %19), written back in place.
              %27 = vector.transfer_read %19[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x2x2x4xf32>, vector<1x2x2x4xf32>
              %28 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x2x2x4xf32>, vector<1x2x2x4xf32>
              %29 = arith.subf %28, %27 : vector<1x2x2x4xf32>
              %30 = vector.transfer_write %29, %26[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf32>, tensor<1x2x2x4xf32>
              // Place the finished sub-tile back into the loop-carried 1x4x4x16 tile.
              %31 = tensor.insert_slice %30 into %arg8[0, %arg3, %arg5, %arg7] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf32> into tensor<1x4x4x16xf32>
              scf.yield %31 : tensor<1x4x4x16xf32>
            } {iree.spirv.distribute_dim = 0 : index}
            scf.yield %18 : tensor<1x4x4x16xf32>
          } {iree.spirv.distribute_dim = 1 : index}
          scf.yield %17 : tensor<1x4x4x16xf32>
        } {iree.spirv.distribute_dim = 2 : index}
        // Store the completed tile to binding 3 at the offsets it was loaded from.
        flow.dispatch.tensor.store %16, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
      }
    }
  }
  return
}
	// Dispatch function that reads tiles from three read-only bindings, accumulates
	// vector.contract results into a zero-initialised tile, subtracts the matching
	// tile of binding 2, and stores the result to the write-only binding 3.
	// Every line here carries the same one-tab indent, so nesting is given by the
	// braces only; the comments below name the loop each closing brace ends.
	// NOTE(review): this symbol name also appears earlier in the file; two definitions
	// in one module would clash — confirm this is a second dump, not a second function.
	func.func @conv_pad_dispatch_1_conv_2d_nhwc_hwcf_1x112x112x16x3x3x3() {
	// All-zero vector used to initialise every 1x2x2x4 accumulator tile.
	%cst = arith.constant dense<0.000000e+00> : vector<1x2x2x4xf32>
	%c0 = arith.constant 0 : index
	%c1 = arith.constant 1 : index
	%c3 = arith.constant 3 : index
	%c2 = arith.constant 2 : index
	%c4 = arith.constant 4 : index
	%c112 = arith.constant 112 : index
	%c16 = arith.constant 16 : index
	// Scalar zero passed as the padding operand of every vector.transfer_read below.
	%cst_0 = arith.constant 0.000000e+00 : f32
	// Bindings 0-2 are read-only (1x225x225x3, 3x3x3x16, 1x112x112x16); binding 3 is the write-only 1x112x112x16 result.
	%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
	%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x16xf32>
	%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x16xf32>
	%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
	%workgroup_id_x = hal.interface.workgroup.id[0] : index
	%workgroup_count_x = hal.interface.workgroup.count[0] : index
	%workgroup_id_y = hal.interface.workgroup.id[1] : index
	%workgroup_count_y = hal.interface.workgroup.count[1] : index
	%workgroup_id_z = hal.interface.workgroup.id[2] : index
	%workgroup_count_z = hal.interface.workgroup.count[2] : index
	// Workgroup-level loops: each starts at id * tile and advances by count * tile.
	// Tile sizes are 4 (z, bound 112), 4 (y, bound 112) and 16 (x, bound 16).
	%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
	%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_z]
	scf.for %arg0 = %4 to %c112 step %5 {
	%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
	%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
	scf.for %arg1 = %6 to %c112 step %7 {
	%8 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
	%9 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_x]
	scf.for %arg2 = %8 to %c16 step %9 {
	// 1x4x4x16 tile of the write-only output; it seeds the iter_args chain and is overwritten slice by slice below.
	%10 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x16xf32> -> tensor<1x4x4x16xf32>
	// Offsets into binding 0 are twice the output offsets; a 1x9x9x3 window is loaded for each 4x4 output tile.
	%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
	%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
	%13 = flow.dispatch.tensor.load %0, offsets = [0, %11, %12, 0], sizes = [1, 9, 9, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x9x9x3xf32>
	%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
	// Tile of binding 2 at the same offsets as the output tile; it is the subtrahend of the arith.subf below.
	%15 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x16xf32> -> tensor<1x4x4x16xf32>
	// Three loops tagged iree.spirv.distribute_dim = 2 / 1 / 0 split the 1x4x4x16 tile into 1x2x2x4 sub-tiles,
	// threading the partially updated tile through iter_args.
	%16 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %10) -> (tensor<1x4x4x16xf32>) {
	%17 = scf.for %arg5 = %c0 to %c4 step %c2 iter_args(%arg6 = %arg4) -> (tensor<1x4x4x16xf32>) {
	%18 = scf.for %arg7 = %c0 to %c16 step %c4 iter_args(%arg8 = %arg6) -> (tensor<1x4x4x16xf32>) {
	%19 = tensor.extract_slice %15[0, %arg3, %arg5, %arg7] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x2x2x4xf32>
	%20 = tensor.extract_slice %arg8[0, %arg3, %arg5, %arg7] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x4x4x16xf32> to tensor<1x2x2x4xf32>
	// Zero-fill the sub-tile; %21 is the initial accumulator for the loops below.
	%21 = vector.transfer_write %cst, %20[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf32>, tensor<1x2x2x4xf32>
	%22 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
	%23 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
	// 1x5x5x3 window of the binding-0 tile and 3x3x3x4 slice of the binding-1 tile used for this sub-tile.
	%24 = tensor.extract_slice %13[0, %22, %23, 0] [1, 5, 5, 3] [1, 1, 1, 1] : tensor<1x9x9x3xf32> to tensor<1x5x5x3xf32>
	%25 = tensor.extract_slice %14[0, 0, 0, %arg7] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x16xf32> to tensor<3x3x3x4xf32>
	// %arg9 and %arg11 each run 0..3 and index the first two dimensions of the 3x3x3x4 slice.
	%26 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %21) -> (tensor<1x2x2x4xf32>) {
	%32 = scf.for %arg11 = %c0 to %c3 step %c1 iter_args(%arg12 = %arg10) -> (tensor<1x2x2x4xf32>) {
	%33 = tensor.extract_slice %24[0, %arg9, %arg11, 0] [1, 3, 3, 3] [1, 1, 1, 1] : tensor<1x5x5x3xf32> to tensor<1x3x3x3xf32>
	%34 = tensor.extract_slice %25[%arg9, %arg11, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
	// Loop over the two rows of the accumulator; carries the spirv.unroll attribute.
	%35 = scf.for %arg13 = %c0 to %c2 step %c1 iter_args(%arg14 = %arg12) -> (tensor<1x2x2x4xf32>) {
	// Row %arg13 of the accumulator pairs with row 2 * %arg13 of the 1x3x3x3 window.
	%36 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg13)
	%37 = tensor.extract_slice %33[0, %36, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x3x3xf32> to tensor<1x1x3x3xf32>
	%38 = tensor.extract_slice %arg14[0, %arg13, 0, 0] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf32> to tensor<1x1x2x4xf32>
	// Rank-reducing slices: drop one unit dimension so the transfers below operate on 3-D tensors.
	%39 = tensor.extract_slice %37[0, 0, 0, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x1x3x3xf32> to tensor<1x3x3xf32>
	%40 = tensor.extract_slice %34[0, 0, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<1x1x3x4xf32> to tensor<1x3x4xf32>
	%41 = tensor.extract_slice %38[0, 0, 0, 0] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> to tensor<1x2x4xf32>
	%42 = vector.transfer_read %39[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x3x3xf32>, vector<1x3x3xf32>
	%43 = vector.transfer_read %40[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x3x4xf32>, vector<1x3x4xf32>
	// Accumulator row is read here and written back at the end of the iteration.
	// NOTE(review): presumably this read/write pair is the hoisting candidate the file name refers to — confirm.
	%44 = vector.transfer_read %41[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x2x4xf32>, vector<1x2x4xf32>
	// Positions 0 and 2 along the middle dimension of %42 feed accumulator positions 0 and 1 respectively.
	%45 = vector.extract_strided_slice %42 {offsets = [0, 0, 0], sizes = [1, 1, 3], strides = [1, 1, 1]} : vector<1x3x3xf32> to vector<1x1x3xf32>
	%46 = vector.extract_strided_slice %42 {offsets = [0, 2, 0], sizes = [1, 1, 3], strides = [1, 1, 1]} : vector<1x3x3xf32> to vector<1x1x3xf32>
	%47 = vector.extract %43[0] : vector<1x3x4xf32>
	%48 = vector.extract_strided_slice %44 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x2x4xf32> to vector<1x1x4xf32>
	%49 = vector.extract_strided_slice %44 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x2x4xf32> to vector<1x1x4xf32>
	// Each contract reduces over d3 (size 3) with kind add: acc[d0, d1, d2] += lhs[d0, d1, d3] * rhs[d3, d2].
	%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %45, %47, %48 : vector<1x1x3xf32>, vector<3x4xf32> into vector<1x1x4xf32>
	%51 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %47, %49 : vector<1x1x3xf32>, vector<3x4xf32> into vector<1x1x4xf32>
	// Reassemble the two updated 1x1x4 pieces into the 1x2x4 accumulator vector.
	%52 = vector.insert_strided_slice %50, %44 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<1x2x4xf32>
	%53 = vector.insert_strided_slice %51, %52 {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<1x2x4xf32>
	%54 = vector.transfer_write %53, %41[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x2x4xf32>, tensor<1x2x4xf32>
	// Undo the two extract_slice steps so the loop yields a full 1x2x2x4 tensor again.
	%55 = tensor.insert_slice %54 into %38[0, 0, 0, 0] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x2x4xf32> into tensor<1x1x2x4xf32>
	%56 = tensor.insert_slice %55 into %arg14[0, %arg13, 0, 0] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x2x2x4xf32>
	scf.yield %56 : tensor<1x2x2x4xf32>
	// Ends the %arg13 loop.
	} {spirv.unroll}
	scf.yield %35 : tensor<1x2x2x4xf32>
	// Ends the %arg11 loop.
	}
	scf.yield %32 : tensor<1x2x2x4xf32>
	// Ends the %arg9 loop.
	}
	// Elementwise epilogue: (accumulated sub-tile %26) minus (binding-2 sub-tile %19), written back in place.
	%27 = vector.transfer_read %19[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x2x2x4xf32>, vector<1x2x2x4xf32>
	%28 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true, true]} : tensor<1x2x2x4xf32>, vector<1x2x2x4xf32>
	%29 = arith.subf %28, %27 : vector<1x2x2x4xf32>
	%30 = vector.transfer_write %29, %26[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x2x2x4xf32>, tensor<1x2x2x4xf32>
	// Place the finished sub-tile back into the loop-carried 1x4x4x16 tile.
	%31 = tensor.insert_slice %30 into %arg8[0, %arg3, %arg5, %arg7] [1, 2, 2, 4] [1, 1, 1, 1] : tensor<1x2x2x4xf32> into tensor<1x4x4x16xf32>
	scf.yield %31 : tensor<1x4x4x16xf32>
	// Ends the %arg7 loop.
	} {iree.spirv.distribute_dim = 0 : index}
	scf.yield %18 : tensor<1x4x4x16xf32>
	// Ends the %arg5 loop.
	} {iree.spirv.distribute_dim = 1 : index}
	scf.yield %17 : tensor<1x4x4x16xf32>
	// Ends the %arg3 loop.
	} {iree.spirv.distribute_dim = 2 : index}
	// Store the completed tile to binding 3 at the offsets it was loaded from.
	flow.dispatch.tensor.store %16, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
	// Ends the %arg2, %arg1 and %arg0 workgroup loops, innermost first.
	}
	}
	}
	return
	}