Skip to content

Instantly share code, notes, and snippets.

@antiagainst
Created September 7, 2022 04:07
Show Gist options
  • Save antiagainst/dbdb1535c5cf0972ff50768f5579b0d2 to your computer and use it in GitHub Desktop.
Save antiagainst/dbdb1535c5cf0972ff50768f5579b0d2 to your computer and use it in GitHub Desktop.
// tools/iree-compile --iree-input-type=mhlo --iree-hal-target-backends=vulkan-spirv --iree-vulkan-target-triple=valhall-unknown-android31 --iree-flow-enable-fuse-padding-into-consumer-ops ~/models/mhlo-conv.mlir -o /dev/null --mlir-print-ir-after-all --mlir-print-ir-after-change --mlir-disable-threading --mlir-elide-elementsattrs-if-larger=8 -debug-only=iree-spirv-vectorize &>! mhlo-conv.log
// iree-org/iree@a8e4c38c
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
#device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, #spv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}>], legacy_sync}>
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, #spv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>]>]>
module attributes {hal.device.targets = [#device_target_vulkan]} {
hal.executable private @conv_pad_dispatch_0 {
hal.executable.variant public @vulkan_spirv_fb, target = #executable_target_vulkan_spirv_fb {
hal.executable.export public @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
%x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%7 = tensor.pad %4 low[0, 0, 0, 0] high[0, 1, 1, 0] {
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
tensor.yield %cst : f32
} : tensor<1x224x224x3xf32> to tensor<1x225x225x3xf32>
%8 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%10 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%7, %5 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%9 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %6 : tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) outs(%8 : tensor<1x112x112x32xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
%12 = arith.subf %arg0, %arg1 : f32
linalg.yield %12 : f32
} -> tensor<1x112x112x32xf32>
flow.dispatch.tensor.store %11, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
return
}
}
}
}
func.func @conv_pad(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c0 = arith.constant 0 : index
%c602112 = arith.constant 602112 : index
%c3456 = arith.constant 3456 : index
%c1605632 = arith.constant 1605632 : index
%c1 = arith.constant 1 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c553648160_i32 = arith.constant 553648160 : i32
%c1_i32 = arith.constant 1 : i32
%c224 = arith.constant 224 : index
%c3 = arith.constant 3 : index
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c1, %c224, %c224, %c3]) type(%c553648160_i32) encoding(%c1_i32)
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1x224x224x3xf32> in !stream.resource<external>{%c602112}
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c3, %c3, %c3, %c32]) type(%c553648160_i32) encoding(%c1_i32)
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<3x3x3x32xf32> in !stream.resource<external>{%c3456}
hal.buffer_view.assert<%arg2 : !hal.buffer_view> message("tensor") shape([%c1, %c112, %c112, %c32]) type(%c553648160_i32) encoding(%c1_i32)
%2 = stream.tensor.import %arg2 : !hal.buffer_view -> tensor<1x112x112x32xf32> in !stream.resource<external>{%c1605632}
%3 = stream.resource.alloc uninitialized : !stream.resource<external>{%c1605632}
%4 = stream.cmd.execute with(%0 as %arg3: !stream.resource<external>{%c602112}, %1 as %arg4: !stream.resource<external>{%c3456}, %2 as %arg5: !stream.resource<external>{%c1605632}, %3 as %arg6: !stream.resource<external>{%c1605632}) {
stream.cmd.dispatch @conv_pad_dispatch_0::@conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3[%c1, %c112, %c112, %c32] {
ro %arg3[%c0 for %c602112] : !stream.resource<external>{%c602112},
ro %arg4[%c0 for %c3456] : !stream.resource<external>{%c3456},
ro %arg5[%c0 for %c1605632] : !stream.resource<external>{%c1605632},
wo %arg6[%c0 for %c1605632] : !stream.resource<external>{%c1605632}
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]}
} => !stream.timepoint
%5 = stream.timepoint.await %4 => %3 : !stream.resource<external>{%c1605632}
%6 = stream.tensor.export %5 : tensor<1x112x112x32xf32> in !stream.resource<external>{%c1605632} -> !hal.buffer_view
return %6 : !hal.buffer_view
}
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, #spv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16, cooperative_matrix_properties_nv = []>>}> {
hal.executable.export public @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<SPIRVVectorize workload_per_wg = [32, 4, 1]>, workgroup_size = [8 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg3]
%1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%arg4]
hal.return %1, %0, %arg2 : index, index, index
}
builtin.module {
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%5 = tensor.pad %4 low[0, 0, 0, 0] high[0, 1, 1, 0] {
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
tensor.yield %cst : f32
} : tensor<1x224x224x3xf32> to tensor<1x225x225x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%9 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %8 to %c32 step %9 {
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = tensor.extract_slice %5[0, %10, %11, 0] [1, 3, 9, 3] [1, 1, 1, 1] : tensor<1x225x225x3xf32> to tensor<1x3x9x3xf32>
%13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %c32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%14 = linalg.init_tensor [1, 1, 4, 32] : tensor<1x1x4x32xf32>
%15 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%14 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%16 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x3x9x3xf32>, tensor<3x3x3x?xf32>) outs(%15 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%17 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, %c4, %c32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x?x?xf32>
%18 = tensor.cast %17 : tensor<1x1x?x?xf32> to tensor<1x1x4x32xf32>
%19 = linalg.init_tensor [1, 1, 4, 32] : tensor<1x1x4x32xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16, %18 : tensor<1x1x4x32xf32>, tensor<1x1x4x32xf32>) outs(%19 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%22 = arith.subf %arg3, %arg4 : f32
linalg.yield %22 : f32
} -> tensor<1x1x4x32xf32>
%21 = tensor.cast %20 : tensor<1x1x4x32xf32> to tensor<1x1x?x?xf32>
flow.dispatch.tensor.store %21, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, %c4, %c32], strides = [1, 1, 1, 1] : tensor<1x1x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
// -----// IR Dump After SPIRVFuseTensorPadWithConsumer (iree-spirv-fuse-tensor-pad-with-consumer) //----- //
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c3 = arith.constant 3 : index
%c9 = arith.constant 9 : index
%c224 = arith.constant 224 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%11 = affine.min affine_map<(d0, d1) -> (d0, 224)>(%9, %c224)
%12 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%9, %c3)
%13 = affine.min affine_map<(d0, d1) -> (d0, 224)>(%12, %c224)
%14 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%13, %11)
%15 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%c3, %14)
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %c0)
%17 = affine.min affine_map<(d0, d1) -> (d0, 224)>(%10, %c224)
%18 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%10, %c9)
%19 = affine.min affine_map<(d0, d1) -> (d0, 224)>(%18, %c224)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%19, %17)
%21 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%c9, %20)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %c0)
%23 = tensor.extract_slice %4[0, %11, %17, 0] [1, %14, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%24 = tensor.pad %23 low[0, 0, 0, 0] high[0, %16, %22, 0] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%25 = tensor.cast %24 : tensor<1x?x?x3xf32> to tensor<1x3x9x3xf32>
%26 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %c32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%27 = linalg.init_tensor [1, 1, 4, 32] : tensor<1x1x4x32xf32>
%28 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%27 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%29 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%25, %26 : tensor<1x3x9x3xf32>, tensor<3x3x3x?xf32>) outs(%28 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%30 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, %c4, %c32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x?x?xf32>
%31 = tensor.cast %30 : tensor<1x1x?x?xf32> to tensor<1x1x4x32xf32>
%32 = linalg.init_tensor [1, 1, 4, 32] : tensor<1x1x4x32xf32>
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %31 : tensor<1x1x4x32xf32>, tensor<1x1x4x32xf32>) outs(%32 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
%35 = arith.subf %arg3, %arg4 : f32
linalg.yield %35 : f32
} -> tensor<1x1x4x32xf32>
%34 = tensor.cast %33 : tensor<1x1x4x32xf32> to tensor<1x1x?x?xf32>
flow.dispatch.tensor.store %34, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, %c4, %c32], strides = [1, 1, 1, 1] : tensor<1x1x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c3 = arith.constant 3 : index
%c9 = arith.constant 9 : index
%c224 = arith.constant 224 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%c4_0 = arith.constant 4 : index
%c32_1 = arith.constant 32 : index
%9 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, %c4_0, %c32_1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x?x?xf32>
%10 = tensor.cast %9 : tensor<1x1x?x?xf32> to tensor<1x1x4x32xf32>
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%13 = affine.min affine_map<(d0, d1) -> (d0, 224)>(%11, %c224)
%14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%11, %c3)
%15 = affine.min affine_map<(d0, d1) -> (d0, 224)>(%14, %c224)
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %13)
%17 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%c3, %16)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%17, %c0)
%19 = affine.min affine_map<(d0, d1) -> (d0, 224)>(%12, %c224)
%20 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%12, %c9)
%21 = affine.min affine_map<(d0, d1) -> (d0, 224)>(%20, %c224)
%22 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%21, %19)
%23 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%c9, %22)
%24 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%23, %c0)
%25 = tensor.extract_slice %4[0, %13, %19, 0] [1, %16, %22, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%26 = tensor.pad %25 low[0, 0, 0, 0] high[0, %18, %24, 0] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%27 = tensor.cast %26 : tensor<1x?x?x3xf32> to tensor<1x3x9x3xf32>
%28 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, %c32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
%29 = linalg.init_tensor [1, 1, 4, 32] : tensor<1x1x4x32xf32>
%30 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%10 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%31 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%27, %28 : tensor<1x3x9x3xf32>, tensor<3x3x3x?xf32>) outs(%30 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%32 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, %c4, %c32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x?x?xf32>
%33 = tensor.cast %32 : tensor<1x1x?x?xf32> to tensor<1x1x4x32xf32>
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<1x1x4x32xf32>) outs(%31 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32):
%36 = arith.subf %arg4, %arg3 : f32
linalg.yield %36 : f32
} -> tensor<1x1x4x32xf32>
%35 = tensor.cast %34 : tensor<1x1x4x32xf32> to tensor<1x1x?x?xf32>
flow.dispatch.tensor.store %35, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, %c4, %c32], strides = [1, 1, 1, 1] : tensor<1x1x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%10 = affine.min affine_map<(d0) -> (224, d0 * 2)>(%arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %10)
%13 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 + 3)>(%11, %10)
%14 = affine.min affine_map<(d0) -> (224, d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %14)
%17 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 + 9)>(%15, %14)
%18 = tensor.extract_slice %4[0, %10, %14, 0] [1, %12, %16, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%19 = tensor.pad %18 low[0, 0, 0, 0] high[0, %13, %17, 0] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x3x9x3xf32>
%20 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%21 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%9 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%22 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%19, %20 : tensor<1x3x9x3xf32>, tensor<3x3x3x32xf32>) outs(%21 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x4x32xf32>) outs(%22 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32):
%25 = arith.subf %arg4, %arg3 : f32
linalg.yield %25 : f32
} -> tensor<1x1x4x32xf32>
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) //----- //
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%11, %10)
%13 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 + 3)>(%11, %10)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%15, %14)
%17 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 + 9)>(%15, %14)
%18 = tensor.extract_slice %4[0, %10, %14, 0] [1, %12, %16, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%19 = tensor.pad %18 low[0, 0, 0, 0] high[0, %13, %17, 0] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x3x9x3xf32>
%20 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%21 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%9 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%22 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%19, %20 : tensor<1x3x9x3xf32>, tensor<3x3x3x32xf32>) outs(%21 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x4x32xf32>) outs(%22 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32):
%25 = arith.subf %arg4, %arg3 : f32
linalg.yield %25 : f32
} -> tensor<1x1x4x32xf32>
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%12 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg0)
%13 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%11, %arg0)
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%16 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%15, %arg1)
%17 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%15, %arg1)
%18 = tensor.extract_slice %4[0, %10, %14, 0] [1, %12, %16, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%19 = tensor.pad %18 low[0, 0, 0, 0] high[0, %13, %17, 0] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x3x9x3xf32>
%20 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%21 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%9 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%22 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%19, %20 : tensor<1x3x9x3xf32>, tensor<3x3x3x32xf32>) outs(%21 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%23 : tensor<1x1x4x32xf32>) outs(%22 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32):
%25 = arith.subf %arg4, %arg3 : f32
linalg.yield %25 : f32
} -> tensor<1x1x4x32xf32>
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
// -----// IR Dump After SPIRVCreateFastSlowPath (iree-spirv-create-fast-slow-path) //----- //
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%12 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%11, %arg1)
%13 = arith.cmpi eq, %10, %c0 : index
%14 = arith.cmpi eq, %12, %c0 : index
%15 = arith.andi %13, %14 : i1
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%16 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%24 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%21, %22 : tensor<1x?x?x3xf32>, tensor<3x3x3x32xf32>) outs(%23 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%25 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25 : tensor<1x1x4x32xf32>) outs(%24 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32):
%27 = arith.subf %arg4, %arg3 : f32
linalg.yield %27 : f32
} -> tensor<1x1x4x32xf32>
flow.dispatch.tensor.store %26, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = tensor.pad %21 low[0, 0, 0, 0] high[0, %10, %12, 0] {
^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index):
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x3x9x3xf32>
%23 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%24 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%16 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%25 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%22, %23 : tensor<1x3x9x3xf32>, tensor<3x3x3x32xf32>) outs(%24 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%26 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26 : tensor<1x1x4x32xf32>) outs(%25 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32):
%28 = arith.subf %arg4, %arg3 : f32
linalg.yield %28 : f32
} -> tensor<1x1x4x32xf32>
flow.dispatch.tensor.store %27, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
// -----// IR Dump After SPIRVTile (iree-spirv-tile) //----- //
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%12 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%11, %arg1)
%13 = arith.cmpi eq, %10, %c0 : index
%14 = arith.cmpi eq, %12, %c0 : index
%15 = arith.andi %13, %14 : i1
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%27 : tensor<1x1x2x4xf32>) -> tensor<1x1x2x4xf32>
%29 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%30 = tensor.extract_slice %21[0, 0, %29, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%31 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%32 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %28) -> (tensor<1x1x2x4xf32>) {
%35 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%36 = tensor.extract_slice %30[0, %arg7, %arg9, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%37 = tensor.extract_slice %31[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%38 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%36, %37 : tensor<1x1x3x3xf32>, tensor<1x1x3x4xf32>) outs(%arg10 : tensor<1x1x2x4xf32>) -> tensor<1x1x2x4xf32>
scf.yield %38 : tensor<1x1x2x4xf32>
}
scf.yield %35 : tensor<1x1x2x4xf32>
}
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26 : tensor<1x1x2x4xf32>) outs(%32 : tensor<1x1x2x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg7: f32, %arg8: f32):
%35 = arith.subf %arg8, %arg7 : f32
linalg.yield %35 : f32
} -> tensor<1x1x2x4xf32>
%34 = tensor.insert_slice %33 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %34 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%27 : tensor<1x1x2x4xf32>) -> tensor<1x1x2x4xf32>
%29 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%30 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%31 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%30, %29)
%32 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %11, %arg1)
%33 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%11, %arg1, %arg3)
%34 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%33, %32)
%35 = tensor.extract_slice %21[0, %29, %32, 0] [1, %31, %34, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%36 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%37 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %28) -> (tensor<1x1x2x4xf32>) {
%40 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%41 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %30, %29)
%42 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %30, %29)
%43 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%42, %41)
%44 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 + 1)>(%42, %41)
%45 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg9, %33, %32)
%46 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg9, %33, %32)
%47 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%46, %45)
%48 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 + 3)>(%46, %45)
%49 = tensor.extract_slice %35[0, %41, %45, 0] [1, %43, %47, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%50 = tensor.pad %49 low[0, 0, 0, 0] high[0, %44, %48, 0] {
^bb0(%arg11: index, %arg12: index, %arg13: index, %arg14: index):
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x1x3x3xf32>
%51 = tensor.extract_slice %36[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%52 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%50, %51 : tensor<1x1x3x3xf32>, tensor<1x1x3x4xf32>) outs(%arg10 : tensor<1x1x2x4xf32>) -> tensor<1x1x2x4xf32>
scf.yield %52 : tensor<1x1x2x4xf32>
}
scf.yield %40 : tensor<1x1x2x4xf32>
}
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26 : tensor<1x1x2x4xf32>) outs(%37 : tensor<1x1x2x4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg7: f32, %arg8: f32):
%40 = arith.subf %arg8, %arg7 : f32
linalg.yield %40 : f32
} -> tensor<1x1x2x4xf32>
%39 = tensor.insert_slice %38 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %39 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After vectorization ---
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x2x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3x3xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%12 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%11, %arg1)
%13 = arith.cmpi eq, %10, %c0 : index
%14 = arith.cmpi eq, %12, %c0 : index
%15 = arith.andi %13, %14 : i1
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.transfer_write %cst, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%29 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%30 = tensor.extract_slice %21[0, 0, %29, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%31 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%32 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %28) -> (tensor<1x1x2x4xf32>) {
%38 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%39 = tensor.extract_slice %30[0, %arg7, %arg9, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%40 = tensor.extract_slice %31[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%41 = vector.transfer_read %40[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<3x4xf32>
%42 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%43 = vector.extract_strided_slice %41 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%44 = vector.extract_strided_slice %41 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%45 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%46 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%47 = vector.extract_strided_slice %45 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %42, %46 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%49 = vector.extract_strided_slice %45 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %43, %48 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%51 = vector.extract_strided_slice %45 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %44, %50 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%53 = vector.transfer_write %52, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
%54 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%55 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%56 = vector.extract_strided_slice %54 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %56, %42, %55 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%58 = vector.extract_strided_slice %54 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %43, %57 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%60 = vector.extract_strided_slice %54 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %44, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%62 = vector.transfer_write %61, %53[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
scf.yield %62 : tensor<1x1x2x4xf32>
}
scf.yield %38 : tensor<1x1x2x4xf32>
}
%33 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%34 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%35 = arith.subf %34, %33 : vector<1x1x2x4xf32>
%36 = vector.transfer_write %35, %32[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%37 = tensor.insert_slice %36 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %37 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.transfer_write %cst, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%29 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%30 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%31 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%30, %29)
%32 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %11, %arg1)
%33 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%11, %arg1, %arg3)
%34 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%33, %32)
%35 = tensor.extract_slice %21[0, %29, %32, 0] [1, %31, %34, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%36 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%37 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %28) -> (tensor<1x1x2x4xf32>) {
%43 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%44 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %30, %29)
%45 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %30, %29)
%46 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%45, %44)
%47 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg9, %33, %32)
%48 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg9, %33, %32)
%49 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%48, %47)
%50 = tensor.extract_slice %35[0, %44, %47, 0] [1, %46, %49, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %46]
%52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %49]
%53 = arith.cmpi sgt, %51, %c0 : index
%54 = arith.cmpi sgt, %52, %c0 : index
%55 = arith.andi %53, %54 : i1
%56 = scf.if %55 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%57 = vector.insert_strided_slice %56, %cst_0 {offsets = [0, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%58 = arith.cmpi sgt, %51, %c0 : index
%59 = arith.cmpi sgt, %52, %c1 : index
%60 = arith.andi %58, %59 : i1
%61 = scf.if %60 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%62 = vector.insert_strided_slice %61, %57 {offsets = [1, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%63 = arith.cmpi sgt, %51, %c0 : index
%64 = arith.cmpi sgt, %52, %c2 : index
%65 = arith.andi %63, %64 : i1
%66 = scf.if %65 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%67 = vector.insert_strided_slice %66, %62 {offsets = [2, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%68 = linalg.init_tensor [1, 1, 3, 3] : tensor<1x1x3x3xf32>
%69 = vector.transfer_write %67, %68[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<3x3xf32>, tensor<1x1x3x3xf32>
%70 = tensor.extract_slice %36[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%71 = vector.transfer_read %70[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<3x4xf32>
%72 = vector.extract_strided_slice %71 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%73 = vector.extract_strided_slice %71 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%74 = vector.extract_strided_slice %71 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%75 = vector.transfer_read %69[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%76 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%77 = vector.extract_strided_slice %75 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %77, %72, %76 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%79 = vector.extract_strided_slice %75 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%80 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %79, %73, %78 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%81 = vector.extract_strided_slice %75 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%82 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %81, %74, %80 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%83 = vector.transfer_write %82, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
%84 = vector.transfer_read %69[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%85 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%86 = vector.extract_strided_slice %84 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%87 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %86, %72, %85 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%88 = vector.extract_strided_slice %84 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%89 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %88, %73, %87 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%90 = vector.extract_strided_slice %84 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%91 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %90, %74, %89 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%92 = vector.transfer_write %91, %83[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
scf.yield %92 : tensor<1x1x2x4xf32>
}
scf.yield %43 : tensor<1x1x2x4xf32>
}
%38 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%39 = vector.transfer_read %37[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%40 = arith.subf %39, %38 : vector<1x1x2x4xf32>
%41 = vector.transfer_write %40, %37[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%42 = tensor.insert_slice %41 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %42 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After peephole optimization ---
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x2x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3x3xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%12 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%11, %arg1)
%13 = arith.cmpi eq, %10, %c0 : index
%14 = arith.cmpi eq, %12, %c0 : index
%15 = arith.andi %13, %14 : i1
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.transfer_write %cst, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%29 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%30 = tensor.extract_slice %21[0, 0, %29, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%31 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%32 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %28) -> (tensor<1x1x2x4xf32>) {
%38 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%39 = tensor.extract_slice %30[0, %arg7, %arg9, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%40 = tensor.extract_slice %31[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%41 = vector.transfer_read %40[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<3x4xf32>
%42 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%43 = vector.extract_strided_slice %41 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%44 = vector.extract_strided_slice %41 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%45 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%46 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%47 = vector.extract_strided_slice %45 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %42, %46 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%49 = vector.extract_strided_slice %45 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %43, %48 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%51 = vector.extract_strided_slice %45 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %44, %50 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%53 = vector.transfer_write %52, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
%54 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%55 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%56 = vector.extract_strided_slice %54 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %56, %42, %55 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%58 = vector.extract_strided_slice %54 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %43, %57 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%60 = vector.extract_strided_slice %54 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %44, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%62 = vector.transfer_write %61, %53[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
scf.yield %62 : tensor<1x1x2x4xf32>
}
scf.yield %38 : tensor<1x1x2x4xf32>
}
%33 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%34 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%35 = arith.subf %34, %33 : vector<1x1x2x4xf32>
%36 = vector.transfer_write %35, %32[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%37 = tensor.insert_slice %36 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %37 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.transfer_write %cst, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%29 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%30 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%31 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%30, %29)
%32 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %11, %arg1)
%33 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%11, %arg1, %arg3)
%34 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%33, %32)
%35 = tensor.extract_slice %21[0, %29, %32, 0] [1, %31, %34, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%36 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%37 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %28) -> (tensor<1x1x2x4xf32>) {
%43 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%44 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %30, %29)
%45 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %30, %29)
%46 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%45, %44)
%47 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg9, %33, %32)
%48 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg9, %33, %32)
%49 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%48, %47)
%50 = tensor.extract_slice %35[0, %44, %47, 0] [1, %46, %49, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %46]
%52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %49]
%53 = arith.cmpi sgt, %51, %c0 : index
%54 = arith.cmpi sgt, %52, %c0 : index
%55 = arith.andi %53, %54 : i1
%56 = scf.if %55 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%57 = vector.insert_strided_slice %56, %cst_0 {offsets = [0, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%58 = arith.cmpi sgt, %51, %c0 : index
%59 = arith.cmpi sgt, %52, %c1 : index
%60 = arith.andi %58, %59 : i1
%61 = scf.if %60 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%62 = vector.insert_strided_slice %61, %57 {offsets = [1, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%63 = arith.cmpi sgt, %51, %c0 : index
%64 = arith.cmpi sgt, %52, %c2 : index
%65 = arith.andi %63, %64 : i1
%66 = scf.if %65 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%67 = vector.insert_strided_slice %66, %62 {offsets = [2, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%68 = linalg.init_tensor [1, 1, 3, 3] : tensor<1x1x3x3xf32>
%69 = vector.transfer_write %67, %68[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<3x3xf32>, tensor<1x1x3x3xf32>
%70 = tensor.extract_slice %36[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%71 = vector.transfer_read %70[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<3x4xf32>
%72 = vector.extract_strided_slice %71 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%73 = vector.extract_strided_slice %71 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%74 = vector.extract_strided_slice %71 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%75 = vector.transfer_read %69[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%76 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%77 = vector.extract_strided_slice %75 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %77, %72, %76 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%79 = vector.extract_strided_slice %75 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%80 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %79, %73, %78 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%81 = vector.extract_strided_slice %75 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%82 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %81, %74, %80 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%83 = vector.transfer_write %82, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
%84 = vector.transfer_read %69[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%85 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%86 = vector.extract_strided_slice %84 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%87 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %86, %72, %85 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%88 = vector.extract_strided_slice %84 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%89 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %88, %73, %87 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%90 = vector.extract_strided_slice %84 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%91 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %90, %74, %89 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%92 = vector.transfer_write %91, %83[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
scf.yield %92 : tensor<1x1x2x4xf32>
}
scf.yield %43 : tensor<1x1x2x4xf32>
}
%38 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%39 = vector.transfer_read %37[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%40 = arith.subf %39, %38 : vector<1x1x2x4xf32>
%41 = vector.transfer_write %40, %37[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%42 = tensor.insert_slice %41 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %42 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After lowering multi_reduction ops ---
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x2x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3x3xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%12 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%11, %arg1)
%13 = arith.cmpi eq, %10, %c0 : index
%14 = arith.cmpi eq, %12, %c0 : index
%15 = arith.andi %13, %14 : i1
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.transfer_write %cst, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%29 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%30 = tensor.extract_slice %21[0, 0, %29, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%31 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%32 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %28) -> (tensor<1x1x2x4xf32>) {
%38 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%39 = tensor.extract_slice %30[0, %arg7, %arg9, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%40 = tensor.extract_slice %31[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%41 = vector.transfer_read %40[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<3x4xf32>
%42 = vector.extract_strided_slice %41 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%43 = vector.extract_strided_slice %41 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%44 = vector.extract_strided_slice %41 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%45 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%46 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%47 = vector.extract_strided_slice %45 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %42, %46 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%49 = vector.extract_strided_slice %45 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %43, %48 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%51 = vector.extract_strided_slice %45 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %44, %50 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%53 = vector.transfer_write %52, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
%54 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%55 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%56 = vector.extract_strided_slice %54 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %56, %42, %55 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%58 = vector.extract_strided_slice %54 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %43, %57 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%60 = vector.extract_strided_slice %54 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %44, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%62 = vector.transfer_write %61, %53[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
scf.yield %62 : tensor<1x1x2x4xf32>
}
scf.yield %38 : tensor<1x1x2x4xf32>
}
%33 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%34 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%35 = arith.subf %34, %33 : vector<1x1x2x4xf32>
%36 = vector.transfer_write %35, %32[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%37 = tensor.insert_slice %36 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %37 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.transfer_write %cst, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%29 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%30 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%31 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%30, %29)
%32 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %11, %arg1)
%33 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%11, %arg1, %arg3)
%34 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%33, %32)
%35 = tensor.extract_slice %21[0, %29, %32, 0] [1, %31, %34, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%36 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%37 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %28) -> (tensor<1x1x2x4xf32>) {
%43 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%44 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %30, %29)
%45 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %30, %29)
%46 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%45, %44)
%47 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg9, %33, %32)
%48 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg9, %33, %32)
%49 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%48, %47)
%50 = tensor.extract_slice %35[0, %44, %47, 0] [1, %46, %49, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %46]
%52 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %49]
%53 = arith.cmpi sgt, %51, %c0 : index
%54 = arith.cmpi sgt, %52, %c0 : index
%55 = arith.andi %53, %54 : i1
%56 = scf.if %55 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%57 = vector.insert_strided_slice %56, %cst_0 {offsets = [0, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%58 = arith.cmpi sgt, %51, %c0 : index
%59 = arith.cmpi sgt, %52, %c1 : index
%60 = arith.andi %58, %59 : i1
%61 = scf.if %60 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%62 = vector.insert_strided_slice %61, %57 {offsets = [1, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%63 = arith.cmpi sgt, %51, %c0 : index
%64 = arith.cmpi sgt, %52, %c2 : index
%65 = arith.andi %63, %64 : i1
%66 = scf.if %65 -> (vector<3xf32>) {
%93 = vector.transfer_read %50[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<3xf32>
scf.yield %93 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%67 = vector.insert_strided_slice %66, %62 {offsets = [2, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%68 = linalg.init_tensor [1, 1, 3, 3] : tensor<1x1x3x3xf32>
%69 = vector.transfer_write %67, %68[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<3x3xf32>, tensor<1x1x3x3xf32>
%70 = tensor.extract_slice %36[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%71 = vector.transfer_read %70[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<3x4xf32>
%72 = vector.extract_strided_slice %71 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%73 = vector.extract_strided_slice %71 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%74 = vector.extract_strided_slice %71 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%75 = vector.transfer_read %69[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%76 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%77 = vector.extract_strided_slice %75 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%78 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %77, %72, %76 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%79 = vector.extract_strided_slice %75 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%80 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %79, %73, %78 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%81 = vector.extract_strided_slice %75 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%82 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %81, %74, %80 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%83 = vector.transfer_write %82, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
%84 = vector.transfer_read %69[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x3xf32>
%85 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%86 = vector.extract_strided_slice %84 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%87 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %86, %72, %85 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%88 = vector.extract_strided_slice %84 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%89 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %88, %73, %87 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%90 = vector.extract_strided_slice %84 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%91 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %90, %74, %89 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%92 = vector.transfer_write %91, %83[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
scf.yield %92 : tensor<1x1x2x4xf32>
}
scf.yield %43 : tensor<1x1x2x4xf32>
}
%38 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%39 = vector.transfer_read %37[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x2x4xf32>
%40 = arith.subf %39, %38 : vector<1x1x2x4xf32>
%41 = vector.transfer_write %40, %37[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf32>, tensor<1x1x2x4xf32>
%42 = tensor.insert_slice %41 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %42 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After unrolling vector ---
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x2x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3x3xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_2 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%12 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%11, %arg1)
%13 = arith.cmpi eq, %10, %c0 : index
%14 = arith.cmpi eq, %12, %c0 : index
%15 = arith.andi %13, %14 : i1
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.extract_strided_slice %cst {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x2x4xf32> to vector<1x1x1x4xf32>
%29 = vector.transfer_write %28, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, tensor<1x1x2x4xf32>
%30 = vector.extract_strided_slice %cst {offsets = [0, 0, 1, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x2x4xf32> to vector<1x1x1x4xf32>
%31 = vector.transfer_write %30, %29[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, tensor<1x1x2x4xf32>
%32 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%33 = tensor.extract_slice %21[0, 0, %32, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%34 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%35 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %31) -> (tensor<1x1x2x4xf32>) {
%45 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%46 = tensor.extract_slice %33[0, %arg7, %arg9, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%47 = tensor.extract_slice %34[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%48 = vector.transfer_read %47[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<1x4xf32>
%49 = vector.transfer_read %47[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<1x4xf32>
%50 = vector.transfer_read %47[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<1x4xf32>
%51 = vector.transfer_read %46[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x1xf32>
%52 = vector.transfer_read %46[%c0, %c0, %c0, %c1], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x1xf32>
%53 = vector.transfer_read %46[%c0, %c0, %c0, %c2], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x1xf32>
%54 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%55 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %48, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %49, %55 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %53, %50, %56 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%58 = vector.transfer_write %57, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
%59 = vector.transfer_read %46[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x1xf32>
%60 = vector.transfer_read %46[%c0, %c0, %c2, %c1], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x1xf32>
%61 = vector.transfer_read %46[%c0, %c0, %c2, %c2], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x3xf32>, vector<1x1xf32>
%62 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %48, %62 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%64 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %49, %63 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %61, %50, %64 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%66 = vector.transfer_write %65, %58[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
scf.yield %66 : tensor<1x1x2x4xf32>
}
scf.yield %45 : tensor<1x1x2x4xf32>
}
%36 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x1x4xf32>
%37 = vector.transfer_read %26[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x1x4xf32>
%38 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x1x4xf32>
%39 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x1x4xf32>
%40 = arith.subf %38, %36 : vector<1x1x1x4xf32>
%41 = arith.subf %39, %37 : vector<1x1x1x4xf32>
%42 = vector.transfer_write %40, %35[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, tensor<1x1x2x4xf32>
%43 = vector.transfer_write %41, %42[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, tensor<1x1x2x4xf32>
%44 = tensor.insert_slice %43 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %44 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.extract_strided_slice %cst {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x2x4xf32> to vector<1x1x1x4xf32>
%29 = vector.transfer_write %28, %27[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, tensor<1x1x2x4xf32>
%30 = vector.extract_strided_slice %cst {offsets = [0, 0, 1, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x1x2x4xf32> to vector<1x1x1x4xf32>
%31 = vector.transfer_write %30, %29[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, tensor<1x1x2x4xf32>
%32 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%33 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%34 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%33, %32)
%35 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %11, %arg1)
%36 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%11, %arg1, %arg3)
%37 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%36, %35)
%38 = tensor.extract_slice %21[0, %32, %35, 0] [1, %34, %37, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%39 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%40 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %31) -> (tensor<1x1x2x4xf32>) {
%50 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%51 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %33, %32)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %33, %32)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %51)
%54 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg9, %36, %35)
%55 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg9, %36, %35)
%56 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%55, %54)
%57 = tensor.extract_slice %38[0, %51, %54, 0] [1, %53, %56, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%58 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %53]
%59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %56]
%60 = arith.cmpi sgt, %58, %c0 : index
%61 = arith.cmpi sgt, %59, %c0 : index
%62 = arith.andi %60, %61 : i1
%63 = scf.if %62 -> (vector<3xf32>) {
%95 = vector.transfer_read %57[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%96 = vector.insert_strided_slice %95, %cst_1 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%97 = vector.transfer_read %57[%c0, %c0, %c0, %c1], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%98 = vector.insert_strided_slice %97, %96 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%99 = vector.transfer_read %57[%c0, %c0, %c0, %c2], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%100 = vector.insert_strided_slice %99, %98 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %100 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%64 = vector.insert_strided_slice %63, %cst_0 {offsets = [0, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%65 = arith.cmpi sgt, %58, %c0 : index
%66 = arith.cmpi sgt, %59, %c1 : index
%67 = arith.andi %65, %66 : i1
%68 = scf.if %67 -> (vector<3xf32>) {
%95 = vector.transfer_read %57[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%96 = vector.insert_strided_slice %95, %cst_1 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%97 = vector.transfer_read %57[%c0, %c0, %c1, %c1], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%98 = vector.insert_strided_slice %97, %96 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%99 = vector.transfer_read %57[%c0, %c0, %c1, %c2], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%100 = vector.insert_strided_slice %99, %98 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %100 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%69 = vector.insert_strided_slice %68, %64 {offsets = [1, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%70 = arith.cmpi sgt, %58, %c0 : index
%71 = arith.cmpi sgt, %59, %c2 : index
%72 = arith.andi %70, %71 : i1
%73 = scf.if %72 -> (vector<3xf32>) {
%95 = vector.transfer_read %57[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%96 = vector.insert_strided_slice %95, %cst_1 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%97 = vector.transfer_read %57[%c0, %c0, %c2, %c1], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%98 = vector.insert_strided_slice %97, %96 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%99 = vector.transfer_read %57[%c0, %c0, %c2, %c2], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%100 = vector.insert_strided_slice %99, %98 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %100 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%74 = vector.insert_strided_slice %73, %69 {offsets = [2, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%75 = vector.extract_strided_slice %74 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%76 = vector.extract_strided_slice %74 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%77 = vector.extract_strided_slice %74 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%78 = vector.extract_strided_slice %74 {offsets = [2, 0], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%79 = vector.extract_strided_slice %74 {offsets = [2, 1], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%80 = vector.extract_strided_slice %74 {offsets = [2, 2], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%81 = tensor.extract_slice %39[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%82 = vector.transfer_read %81[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<1x4xf32>
%83 = vector.transfer_read %81[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<1x4xf32>
%84 = vector.transfer_read %81[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x3x4xf32>, vector<1x4xf32>
%85 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%86 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %75, %82, %85 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%87 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %76, %83, %86 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%88 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %77, %84, %87 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%89 = vector.transfer_write %88, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
%90 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true]} : tensor<1x1x2x4xf32>, vector<1x4xf32>
%91 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %78, %82, %90 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%92 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %79, %83, %91 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%93 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %80, %84, %92 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%94 = vector.transfer_write %93, %89[%c0, %c0, %c1, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x1x2x4xf32>
scf.yield %94 : tensor<1x1x2x4xf32>
}
scf.yield %50 : tensor<1x1x2x4xf32>
}
%41 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x1x4xf32>
%42 = vector.transfer_read %26[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x1x4xf32>
%43 = vector.transfer_read %40[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x1x4xf32>
%44 = vector.transfer_read %40[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true, true, true, true]} : tensor<1x1x2x4xf32>, vector<1x1x1x4xf32>
%45 = arith.subf %43, %41 : vector<1x1x1x4xf32>
%46 = arith.subf %44, %42 : vector<1x1x1x4xf32>
%47 = vector.transfer_write %45, %40[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, tensor<1x1x2x4xf32>
%48 = vector.transfer_write %46, %47[%c0, %c0, %c1, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x4xf32>, tensor<1x1x2x4xf32>
%49 = tensor.insert_slice %48 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %49 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After casting away leading size-1 dims ---
// Tiled + vectorized 2-D NHWC convolution dispatch: produces a 1x112x112x32
// output from a 1x224x224x3 input and a 3x3x3x32 HWCF filter. Stride 2 is
// visible in the `d0 * 2` affine maps below. After accumulation, the tensor
// loaded from binding(2) is subtracted from the convolution result (see the
// `arith.subf` ops) before the tile is stored to binding(3).
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<1x1x2x4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3x3xf32>
%cst_1 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_2 = arith.constant 0.000000e+00 : f32
// Bindings: 0 = input image, 1 = filter, 2 = tensor subtracted from the
// conv result, 3 = output.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Workgroup tiling: z iterates over the 112 output rows one at a time,
// y over 4-column output tiles, x over 32-channel output tiles.
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
// %10/%12 measure how far the requested input window (3 rows / 9 cols,
// clamped at the 224 boundary by the affine.min ops) falls short of the
// full extent; both being zero means this tile needs no padding.
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%12 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%11, %arg1)
%13 = arith.cmpi eq, %10, %c0 : index
%14 = arith.cmpi eq, %12, %c0 : index
%15 = arith.andi %13, %14 : i1
// Fast path: input window fully in bounds, all reads unconditional.
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
// Inner tiling: %arg3 walks the 4 output columns two at a time,
// %arg5 walks the 32 channels four at a time (one 1x1x2x4 tile each).
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
// Zero-initialize the 1x1x2x4 accumulator tile.
%28 = vector.extract %cst[0, 0, 0] : vector<1x1x2x4xf32>
%29 = vector.transfer_write %28, %27[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%30 = vector.extract %cst[0, 0, 1] : vector<1x1x2x4xf32>
%31 = vector.transfer_write %30, %29[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%32 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%33 = tensor.extract_slice %21[0, 0, %32, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%34 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
// Reduction over the 3x3 filter window (%arg7 = filter row,
// %arg9 = filter col); each step is a chain of vector.contract FMAs.
%35 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %31) -> (tensor<1x1x2x4xf32>) {
%45 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%46 = tensor.extract_slice %33[0, %arg7, %arg9, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%47 = tensor.extract_slice %34[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%48 = vector.transfer_read %47[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%49 = vector.broadcast %48 : vector<4xf32> to vector<1x4xf32>
%50 = vector.transfer_read %47[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%51 = vector.broadcast %50 : vector<4xf32> to vector<1x4xf32>
%52 = vector.transfer_read %47[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%53 = vector.broadcast %52 : vector<4xf32> to vector<1x4xf32>
%54 = vector.transfer_read %46[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%55 = vector.transfer_read %46[%c0, %c0, %c0, %c1], %cst_2 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%56 = vector.transfer_read %46[%c0, %c0, %c0, %c2], %cst_2 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%57 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%58 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %54, %49, %57 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %55, %51, %58 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%60 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %56, %53, %59 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%61 = vector.transfer_write %60, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%62 = vector.transfer_read %46[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%63 = vector.transfer_read %46[%c0, %c0, %c2, %c1], %cst_2 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%64 = vector.transfer_read %46[%c0, %c0, %c2, %c2], %cst_2 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%65 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %62, %49, %65 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%67 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %63, %51, %66 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%68 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %64, %53, %67 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%69 = vector.transfer_write %68, %61[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
scf.yield %69 : tensor<1x1x2x4xf32>
}
scf.yield %45 : tensor<1x1x2x4xf32>
}
// Post-processing: subtract the binding(2) tile (%26) from the conv
// result (%35) and insert back into the 1x1x4x32 workgroup tile.
%36 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%37 = vector.transfer_read %26[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%38 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%39 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%40 = arith.subf %38, %36 : vector<4xf32>
%41 = arith.subf %39, %37 : vector<4xf32>
%42 = vector.transfer_write %40, %35[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%43 = vector.transfer_write %41, %42[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%44 = tensor.insert_slice %43 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %44 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
// Slow (padded) path: same loop nest, but the input window has dynamic
// extents and every per-element read is guarded by an scf.if that
// yields the zero vector %cst_1 when out of bounds (zero padding).
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
// Zero-initialize the 1x1x2x4 accumulator tile.
%28 = vector.extract %cst[0, 0, 0] : vector<1x1x2x4xf32>
%29 = vector.transfer_write %28, %27[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%30 = vector.extract %cst[0, 0, 1] : vector<1x1x2x4xf32>
%31 = vector.transfer_write %30, %29[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
// Clamp the per-tile input window to the valid region: %34/%37 are the
// dynamic in-bounds height/width extents of slice %38.
%32 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%33 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%34 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%33, %32)
%35 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %11, %arg1)
%36 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%11, %arg1, %arg3)
%37 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%36, %35)
%38 = tensor.extract_slice %21[0, %32, %35, 0] [1, %34, %37, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%39 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%40 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %31) -> (tensor<1x1x2x4xf32>) {
%50 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
// %53/%56: in-bounds height/width of the 1x3 input patch at this
// filter position; %58/%59 feed the scf.if guards below.
%51 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %33, %32)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %33, %32)
%53 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%52, %51)
%54 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg9, %36, %35)
%55 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg9, %36, %35)
%56 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%55, %54)
%57 = tensor.extract_slice %38[0, %51, %54, 0] [1, %53, %56, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%58 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %53]
%59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %56]
// Gather a 3x3 block of input scalars into %74, one guarded 3-wide
// row at a time; out-of-bounds rows become the zero vector %cst_1.
%60 = arith.cmpi sgt, %58, %c0 : index
%61 = arith.cmpi sgt, %59, %c0 : index
%62 = arith.andi %60, %61 : i1
%63 = scf.if %62 -> (vector<3xf32>) {
%104 = vector.transfer_read %57[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%105 = vector.insert_strided_slice %104, %cst_1 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%106 = vector.transfer_read %57[%c0, %c0, %c0, %c1], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%107 = vector.insert_strided_slice %106, %105 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%108 = vector.transfer_read %57[%c0, %c0, %c0, %c2], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%109 = vector.insert_strided_slice %108, %107 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %109 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%64 = vector.insert_strided_slice %63, %cst_0 {offsets = [0, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%65 = arith.cmpi sgt, %58, %c0 : index
%66 = arith.cmpi sgt, %59, %c1 : index
%67 = arith.andi %65, %66 : i1
%68 = scf.if %67 -> (vector<3xf32>) {
%104 = vector.transfer_read %57[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%105 = vector.insert_strided_slice %104, %cst_1 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%106 = vector.transfer_read %57[%c0, %c0, %c1, %c1], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%107 = vector.insert_strided_slice %106, %105 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%108 = vector.transfer_read %57[%c0, %c0, %c1, %c2], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%109 = vector.insert_strided_slice %108, %107 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %109 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%69 = vector.insert_strided_slice %68, %64 {offsets = [1, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%70 = arith.cmpi sgt, %58, %c0 : index
%71 = arith.cmpi sgt, %59, %c2 : index
%72 = arith.andi %70, %71 : i1
%73 = scf.if %72 -> (vector<3xf32>) {
%104 = vector.transfer_read %57[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%105 = vector.insert_strided_slice %104, %cst_1 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%106 = vector.transfer_read %57[%c0, %c0, %c2, %c1], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%107 = vector.insert_strided_slice %106, %105 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%108 = vector.transfer_read %57[%c0, %c0, %c2, %c2], %cst_2 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%109 = vector.insert_strided_slice %108, %107 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %109 : vector<3xf32>
} else {
scf.yield %cst_1 : vector<3xf32>
}
%74 = vector.insert_strided_slice %73, %69 {offsets = [2, 0], strides = [1]} : vector<3xf32> into vector<3x3xf32>
%75 = vector.extract_strided_slice %74 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%76 = vector.extract_strided_slice %74 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%77 = vector.extract_strided_slice %74 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%78 = vector.extract_strided_slice %74 {offsets = [2, 0], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%79 = vector.extract_strided_slice %74 {offsets = [2, 1], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%80 = vector.extract_strided_slice %74 {offsets = [2, 2], sizes = [1, 1], strides = [1, 1]} : vector<3x3xf32> to vector<1x1xf32>
%81 = tensor.extract_slice %39[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%82 = vector.transfer_read %81[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%83 = vector.broadcast %82 : vector<4xf32> to vector<1x4xf32>
%84 = vector.transfer_read %81[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%85 = vector.broadcast %84 : vector<4xf32> to vector<1x4xf32>
%86 = vector.transfer_read %81[%c0, %c0, %c2, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%87 = vector.broadcast %86 : vector<4xf32> to vector<1x4xf32>
// Accumulate the gathered scalars against the filter vectors via
// chained vector.contract FMAs, one chain per output column.
%88 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%89 = vector.extract %75[0] : vector<1x1xf32>
%90 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %89, %83, %88 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%91 = vector.extract %76[0] : vector<1x1xf32>
%92 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %91, %85, %90 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%93 = vector.extract %77[0] : vector<1x1xf32>
%94 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %93, %87, %92 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%95 = vector.transfer_write %94, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%96 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%97 = vector.extract %78[0] : vector<1x1xf32>
%98 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %97, %83, %96 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%99 = vector.extract %79[0] : vector<1x1xf32>
%100 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %99, %85, %98 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%101 = vector.extract %80[0] : vector<1x1xf32>
%102 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %101, %87, %100 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%103 = vector.transfer_write %102, %95[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
scf.yield %103 : tensor<1x1x2x4xf32>
}
scf.yield %50 : tensor<1x1x2x4xf32>
}
// Post-processing: subtract the binding(2) tile (%26) from the conv
// result (%40) and insert back into the 1x1x4x32 workgroup tile.
%41 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%42 = vector.transfer_read %26[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%43 = vector.transfer_read %40[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%44 = vector.transfer_read %40[%c0, %c0, %c1, %c0], %cst_2 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%45 = arith.subf %43, %41 : vector<4xf32>
%46 = arith.subf %44, %42 : vector<4xf32>
%47 = vector.transfer_write %45, %40[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%48 = vector.transfer_write %46, %47[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%49 = tensor.insert_slice %48 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %49 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After breaking down n-D inserts/extracts ---
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%12 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%11, %arg1)
%13 = arith.cmpi eq, %10, %c0 : index
%14 = arith.cmpi eq, %12, %c0 : index
%15 = arith.andi %13, %14 : i1
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.transfer_write %cst, %27[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%29 = vector.transfer_write %cst, %28[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%30 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%31 = tensor.extract_slice %21[0, 0, %30, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%32 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%33 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %29) -> (tensor<1x1x2x4xf32>) {
%43 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%44 = tensor.extract_slice %31[0, %arg7, %arg9, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%45 = tensor.extract_slice %32[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%46 = vector.transfer_read %45[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%47 = vector.broadcast %46 : vector<4xf32> to vector<1x4xf32>
%48 = vector.transfer_read %45[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%49 = vector.broadcast %48 : vector<4xf32> to vector<1x4xf32>
%50 = vector.transfer_read %45[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%51 = vector.broadcast %50 : vector<4xf32> to vector<1x4xf32>
%52 = vector.transfer_read %44[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%53 = vector.transfer_read %44[%c0, %c0, %c0, %c1], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%54 = vector.transfer_read %44[%c0, %c0, %c0, %c2], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%55 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %52, %47, %55 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %53, %49, %56 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%58 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %54, %51, %57 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%59 = vector.transfer_write %58, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%60 = vector.transfer_read %44[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%61 = vector.transfer_read %44[%c0, %c0, %c2, %c1], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%62 = vector.transfer_read %44[%c0, %c0, %c2, %c2], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%63 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%64 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %60, %47, %63 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %61, %49, %64 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %62, %51, %65 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%67 = vector.transfer_write %66, %59[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
scf.yield %67 : tensor<1x1x2x4xf32>
}
scf.yield %43 : tensor<1x1x2x4xf32>
}
%34 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%35 = vector.transfer_read %26[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%37 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%38 = arith.subf %36, %34 : vector<4xf32>
%39 = arith.subf %37, %35 : vector<4xf32>
%40 = vector.transfer_write %38, %33[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%41 = vector.transfer_write %39, %40[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%42 = tensor.insert_slice %41 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %42 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%11, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%26 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%27 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%28 = vector.transfer_write %cst, %27[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%29 = vector.transfer_write %cst, %28[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%30 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%31 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%32 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%31, %30)
%33 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %11, %arg1)
%34 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%11, %arg1, %arg3)
%35 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%34, %33)
%36 = tensor.extract_slice %21[0, %30, %33, 0] [1, %32, %35, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%37 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%38 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %29) -> (tensor<1x1x2x4xf32>) {
%48 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %arg8) -> (tensor<1x1x2x4xf32>) {
%49 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %31, %30)
%50 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %31, %30)
%51 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%50, %49)
%52 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg9, %34, %33)
%53 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg9, %34, %33)
%54 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%53, %52)
%55 = tensor.extract_slice %36[0, %49, %52, 0] [1, %51, %54, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%56 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %51]
%57 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %54]
%58 = arith.cmpi sgt, %56, %c0 : index
%59 = arith.cmpi sgt, %57, %c0 : index
%60 = arith.andi %58, %59 : i1
%61 = scf.if %60 -> (vector<3xf32>) {
%89 = vector.transfer_read %55[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%90 = vector.insert_strided_slice %89, %cst_0 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%91 = vector.transfer_read %55[%c0, %c0, %c0, %c1], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%92 = vector.insert_strided_slice %91, %90 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%93 = vector.transfer_read %55[%c0, %c0, %c0, %c2], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%94 = vector.insert_strided_slice %93, %92 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %94 : vector<3xf32>
} else {
scf.yield %cst_0 : vector<3xf32>
}
%62 = arith.cmpi sgt, %56, %c0 : index
%63 = arith.cmpi sgt, %57, %c2 : index
%64 = arith.andi %62, %63 : i1
%65 = scf.if %64 -> (vector<3xf32>) {
%89 = vector.transfer_read %55[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%90 = vector.insert_strided_slice %89, %cst_0 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%91 = vector.transfer_read %55[%c0, %c0, %c2, %c1], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%92 = vector.insert_strided_slice %91, %90 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%93 = vector.transfer_read %55[%c0, %c0, %c2, %c2], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%94 = vector.insert_strided_slice %93, %92 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %94 : vector<3xf32>
} else {
scf.yield %cst_0 : vector<3xf32>
}
%66 = vector.extract_strided_slice %61 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%67 = vector.extract_strided_slice %61 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%68 = vector.extract_strided_slice %61 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%69 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%70 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%71 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%72 = tensor.extract_slice %37[%arg7, %arg9, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%73 = vector.transfer_read %72[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%74 = vector.broadcast %73 : vector<4xf32> to vector<1x4xf32>
%75 = vector.transfer_read %72[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%76 = vector.broadcast %75 : vector<4xf32> to vector<1x4xf32>
%77 = vector.transfer_read %72[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%78 = vector.broadcast %77 : vector<4xf32> to vector<1x4xf32>
%79 = vector.transfer_read %arg10[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%80 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %66, %74, %79 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%81 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %67, %76, %80 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%82 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %68, %78, %81 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%83 = vector.transfer_write %82, %arg10[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%84 = vector.transfer_read %arg10[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%85 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %69, %74, %84 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%86 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %70, %76, %85 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%87 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %71, %78, %86 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%88 = vector.transfer_write %87, %83[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
scf.yield %88 : tensor<1x1x2x4xf32>
}
scf.yield %48 : tensor<1x1x2x4xf32>
}
%39 = vector.transfer_read %26[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%40 = vector.transfer_read %26[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%41 = vector.transfer_read %38[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%42 = vector.transfer_read %38[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%43 = arith.subf %41, %39 : vector<4xf32>
%44 = arith.subf %42, %40 : vector<4xf32>
%45 = vector.transfer_write %43, %38[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%46 = vector.transfer_write %44, %45[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%47 = tensor.insert_slice %46 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %47 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %25 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After hoisting vector transfers ---
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = arith.cmpi eq, %10, %c0 : index
scf.for %arg1 = %5 to %c112 step %6 {
%12 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%13 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%12, %arg1)
%14 = arith.cmpi eq, %13, %c0 : index
%15 = arith.andi %11, %14 : i1
scf.for %arg2 = %7 to %c32 step %8 {
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%12, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%26 = tensor.extract_slice %21[0, 0, %25, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%27 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%28 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%29 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%30 = vector.transfer_write %cst, %29[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%31 = vector.transfer_write %cst, %30[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%32 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%33:2 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %cst, %arg9 = %cst) -> (vector<4xf32>, vector<4xf32>) {
%43:2 = scf.for %arg10 = %c0 to %c3 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (vector<4xf32>, vector<4xf32>) {
%44 = tensor.extract_slice %26[0, %arg7, %arg10, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%45 = tensor.extract_slice %32[%arg7, %arg10, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%46 = vector.transfer_read %45[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%47 = vector.broadcast %46 : vector<4xf32> to vector<1x4xf32>
%48 = vector.transfer_read %45[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%49 = vector.broadcast %48 : vector<4xf32> to vector<1x4xf32>
%50 = vector.transfer_read %45[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%51 = vector.broadcast %50 : vector<4xf32> to vector<1x4xf32>
%52 = vector.transfer_read %44[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%53 = vector.transfer_read %44[%c0, %c0, %c0, %c1], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%54 = vector.transfer_read %44[%c0, %c0, %c0, %c2], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%55 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %52, %47, %arg12 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %53, %49, %55 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %54, %51, %56 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%58 = vector.transfer_read %44[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%59 = vector.transfer_read %44[%c0, %c0, %c2, %c1], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%60 = vector.transfer_read %44[%c0, %c0, %c2, %c2], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %58, %47, %arg11 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %59, %49, %61 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %60, %51, %62 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
scf.yield %63, %57 : vector<4xf32>, vector<4xf32>
}
scf.yield %43#0, %43#1 : vector<4xf32>, vector<4xf32>
}
%34 = vector.transfer_write %33#1, %31[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%35 = vector.transfer_write %33#0, %34[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%36 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%37 = vector.transfer_read %28[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%38 = arith.subf %33#1, %36 : vector<4xf32>
%39 = arith.subf %33#0, %37 : vector<4xf32>
%40 = vector.transfer_write %38, %35[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%41 = vector.transfer_write %39, %40[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%42 = tensor.insert_slice %41 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %42 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %27 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%12, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%25 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%26 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%25, %24)
%27 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%28 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %12, %arg1)
%29 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%12, %arg1, %arg3)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %28)
%31 = tensor.extract_slice %21[0, %24, %28, 0] [1, %26, %30, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%32 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%33 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%34 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%35 = vector.transfer_write %cst, %34[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%36 = vector.transfer_write %cst, %35[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%37 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%38:2 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %cst, %arg9 = %cst) -> (vector<4xf32>, vector<4xf32>) {
%48 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %25, %24)
%49 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %25, %24)
%50 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%49, %48)
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %50]
%52 = arith.cmpi sgt, %51, %c0 : index
%53 = arith.cmpi sgt, %51, %c0 : index
%54:2 = scf.for %arg10 = %c0 to %c3 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (vector<4xf32>, vector<4xf32>) {
%55 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg10, %29, %28)
%56 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg10, %29, %28)
%57 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%56, %55)
%58 = tensor.extract_slice %31[0, %48, %55, 0] [1, %50, %57, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %57]
%60 = arith.cmpi sgt, %59, %c0 : index
%61 = arith.andi %52, %60 : i1
%62 = scf.if %61 -> (vector<3xf32>) {
%85 = vector.transfer_read %58[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%86 = vector.insert_strided_slice %85, %cst_0 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%87 = vector.transfer_read %58[%c0, %c0, %c0, %c1], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%88 = vector.insert_strided_slice %87, %86 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%89 = vector.transfer_read %58[%c0, %c0, %c0, %c2], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%90 = vector.insert_strided_slice %89, %88 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %90 : vector<3xf32>
} else {
scf.yield %cst_0 : vector<3xf32>
}
%63 = arith.cmpi sgt, %59, %c2 : index
%64 = arith.andi %53, %63 : i1
%65 = scf.if %64 -> (vector<3xf32>) {
%85 = vector.transfer_read %58[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%86 = vector.insert_strided_slice %85, %cst_0 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%87 = vector.transfer_read %58[%c0, %c0, %c2, %c1], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%88 = vector.insert_strided_slice %87, %86 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%89 = vector.transfer_read %58[%c0, %c0, %c2, %c2], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%90 = vector.insert_strided_slice %89, %88 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %90 : vector<3xf32>
} else {
scf.yield %cst_0 : vector<3xf32>
}
%66 = vector.extract_strided_slice %62 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%67 = vector.extract_strided_slice %62 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%68 = vector.extract_strided_slice %62 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%69 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%70 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%71 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%72 = tensor.extract_slice %37[%arg7, %arg10, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%73 = vector.transfer_read %72[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%74 = vector.broadcast %73 : vector<4xf32> to vector<1x4xf32>
%75 = vector.transfer_read %72[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%76 = vector.broadcast %75 : vector<4xf32> to vector<1x4xf32>
%77 = vector.transfer_read %72[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%78 = vector.broadcast %77 : vector<4xf32> to vector<1x4xf32>
%79 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %66, %74, %arg12 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%80 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %67, %76, %79 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%81 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %68, %78, %80 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%82 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %69, %74, %arg11 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%83 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %70, %76, %82 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%84 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %71, %78, %83 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
scf.yield %84, %81 : vector<4xf32>, vector<4xf32>
}
scf.yield %54#0, %54#1 : vector<4xf32>, vector<4xf32>
}
%39 = vector.transfer_write %38#1, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%40 = vector.transfer_write %38#0, %39[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%41 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%42 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%43 = arith.subf %38#1, %41 : vector<4xf32>
%44 = arith.subf %38#0, %42 : vector<4xf32>
%45 = vector.transfer_write %43, %40[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%46 = vector.transfer_write %44, %45[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%47 = tensor.insert_slice %46 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %47 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %32 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %27, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After lowering transfer ops ---
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = arith.cmpi eq, %10, %c0 : index
scf.for %arg1 = %5 to %c112 step %6 {
%12 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%13 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%12, %arg1)
%14 = arith.cmpi eq, %13, %c0 : index
%15 = arith.andi %11, %14 : i1
scf.for %arg2 = %7 to %c32 step %8 {
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%12, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%26 = tensor.extract_slice %21[0, 0, %25, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%27 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%28 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%29 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%30 = vector.transfer_write %cst, %29[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%31 = vector.transfer_write %cst, %30[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%32 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%33:2 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %cst, %arg9 = %cst) -> (vector<4xf32>, vector<4xf32>) {
%43:2 = scf.for %arg10 = %c0 to %c3 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (vector<4xf32>, vector<4xf32>) {
%44 = tensor.extract_slice %26[0, %arg7, %arg10, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%45 = tensor.extract_slice %32[%arg7, %arg10, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%46 = vector.transfer_read %45[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%47 = vector.broadcast %46 : vector<4xf32> to vector<1x4xf32>
%48 = vector.transfer_read %45[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%49 = vector.broadcast %48 : vector<4xf32> to vector<1x4xf32>
%50 = vector.transfer_read %45[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%51 = vector.broadcast %50 : vector<4xf32> to vector<1x4xf32>
%52 = vector.transfer_read %44[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%53 = vector.transfer_read %44[%c0, %c0, %c0, %c1], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%54 = vector.transfer_read %44[%c0, %c0, %c0, %c2], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%55 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %52, %47, %arg12 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %53, %49, %55 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %54, %51, %56 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%58 = vector.transfer_read %44[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%59 = vector.transfer_read %44[%c0, %c0, %c2, %c1], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%60 = vector.transfer_read %44[%c0, %c0, %c2, %c2], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %58, %47, %arg11 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %59, %49, %61 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %60, %51, %62 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
scf.yield %63, %57 : vector<4xf32>, vector<4xf32>
}
scf.yield %43#0, %43#1 : vector<4xf32>, vector<4xf32>
}
%34 = vector.transfer_write %33#1, %31[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%35 = vector.transfer_write %33#0, %34[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%36 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%37 = vector.transfer_read %28[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%38 = arith.subf %33#1, %36 : vector<4xf32>
%39 = arith.subf %33#0, %37 : vector<4xf32>
%40 = vector.transfer_write %38, %35[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%41 = vector.transfer_write %39, %40[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%42 = tensor.insert_slice %41 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %42 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %27 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%12, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%25 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%26 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%25, %24)
%27 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%28 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %12, %arg1)
%29 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%12, %arg1, %arg3)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %28)
%31 = tensor.extract_slice %21[0, %24, %28, 0] [1, %26, %30, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%32 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%33 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%34 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%35 = vector.transfer_write %cst, %34[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%36 = vector.transfer_write %cst, %35[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%37 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%38:2 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %cst, %arg9 = %cst) -> (vector<4xf32>, vector<4xf32>) {
%48 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %25, %24)
%49 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %25, %24)
%50 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%49, %48)
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %50]
%52 = arith.cmpi sgt, %51, %c0 : index
%53 = arith.cmpi sgt, %51, %c0 : index
%54:2 = scf.for %arg10 = %c0 to %c3 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (vector<4xf32>, vector<4xf32>) {
%55 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg10, %29, %28)
%56 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg10, %29, %28)
%57 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%56, %55)
%58 = tensor.extract_slice %31[0, %48, %55, 0] [1, %50, %57, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %57]
%60 = arith.cmpi sgt, %59, %c0 : index
%61 = arith.andi %52, %60 : i1
%62 = scf.if %61 -> (vector<3xf32>) {
%85 = vector.transfer_read %58[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%86 = vector.insert_strided_slice %85, %cst_0 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%87 = vector.transfer_read %58[%c0, %c0, %c0, %c1], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%88 = vector.insert_strided_slice %87, %86 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%89 = vector.transfer_read %58[%c0, %c0, %c0, %c2], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%90 = vector.insert_strided_slice %89, %88 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %90 : vector<3xf32>
} else {
scf.yield %cst_0 : vector<3xf32>
}
%63 = arith.cmpi sgt, %59, %c2 : index
%64 = arith.andi %53, %63 : i1
%65 = scf.if %64 -> (vector<3xf32>) {
%85 = vector.transfer_read %58[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%86 = vector.insert_strided_slice %85, %cst_0 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
%87 = vector.transfer_read %58[%c0, %c0, %c2, %c1], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%88 = vector.insert_strided_slice %87, %86 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
%89 = vector.transfer_read %58[%c0, %c0, %c2, %c2], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
%90 = vector.insert_strided_slice %89, %88 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
scf.yield %90 : vector<3xf32>
} else {
scf.yield %cst_0 : vector<3xf32>
}
%66 = vector.extract_strided_slice %62 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%67 = vector.extract_strided_slice %62 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%68 = vector.extract_strided_slice %62 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%69 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%70 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%71 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
%72 = tensor.extract_slice %37[%arg7, %arg10, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%73 = vector.transfer_read %72[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%74 = vector.broadcast %73 : vector<4xf32> to vector<1x4xf32>
%75 = vector.transfer_read %72[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%76 = vector.broadcast %75 : vector<4xf32> to vector<1x4xf32>
%77 = vector.transfer_read %72[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%78 = vector.broadcast %77 : vector<4xf32> to vector<1x4xf32>
%79 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %66, %74, %arg12 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%80 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %67, %76, %79 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%81 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %68, %78, %80 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%82 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %69, %74, %arg11 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%83 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %70, %76, %82 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
%84 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %71, %78, %83 : vector<1xf32>, vector<1x4xf32> into vector<4xf32>
scf.yield %84, %81 : vector<4xf32>, vector<4xf32>
}
scf.yield %54#0, %54#1 : vector<4xf32>, vector<4xf32>
}
%39 = vector.transfer_write %38#1, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%40 = vector.transfer_write %38#0, %39[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%41 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%42 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%43 = arith.subf %38#1, %41 : vector<4xf32>
%44 = arith.subf %38#0, %42 : vector<4xf32>
%45 = vector.transfer_write %43, %40[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%46 = vector.transfer_write %44, %45[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%47 = tensor.insert_slice %46 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %47 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %32 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %27, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
}
return
}
--- After lowering contract ops ---
func.func @conv_pad_dispatch_0_conv_2d_nhwc_hwcf_1x112x112x32x3x3x3() {
%cst = arith.constant dense<0.000000e+00> : vector<4xf32>
%cst_0 = arith.constant dense<0.000000e+00> : vector<3xf32>
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%cst_1 = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x224x224x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x112x112x32xf32>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 224, 224, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x224x224x3xf32> -> tensor<1x224x224x3xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg0 = %workgroup_id_z to %c112 step %workgroup_count_z {
%9 = affine.min affine_map<(d0) -> (224, d0 * 2 + 3)>(%arg0)
%10 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 3)>(%9, %arg0)
%11 = arith.cmpi eq, %10, %c0 : index
scf.for %arg1 = %5 to %c112 step %6 {
%12 = affine.min affine_map<(d0) -> (224, d0 * 2 + 9)>(%arg1)
%13 = affine.apply affine_map<(d0, d1) -> (-d0 + d1 * 2 + 9)>(%12, %arg1)
%14 = arith.cmpi eq, %13, %c0 : index
%15 = arith.andi %11, %14 : i1
scf.for %arg2 = %7 to %c32 step %8 {
scf.if %15 {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%12, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%25 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg3)
%26 = tensor.extract_slice %21[0, 0, %25, 0] [1, 3, 5, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x3x5x3xf32>
%27 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%28 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%29 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%30 = vector.transfer_write %cst, %29[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%31 = vector.transfer_write %cst, %30[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%32 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
%33:2 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %cst, %arg9 = %cst) -> (vector<4xf32>, vector<4xf32>) {
%43:2 = scf.for %arg10 = %c0 to %c3 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (vector<4xf32>, vector<4xf32>) {
%44 = tensor.extract_slice %26[0, %arg7, %arg10, 0] [1, 1, 3, 3] [1, 1, 1, 1] : tensor<1x3x5x3xf32> to tensor<1x1x3x3xf32>
%45 = tensor.extract_slice %32[%arg7, %arg10, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
%46 = vector.transfer_read %45[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%47 = vector.transfer_read %45[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%48 = vector.transfer_read %45[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
%49 = vector.transfer_read %44[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%50 = vector.transfer_read %44[%c0, %c0, %c0, %c1], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%51 = vector.transfer_read %44[%c0, %c0, %c0, %c2], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%52 = vector.broadcast %49 : vector<1xf32> to vector<1x4xf32>
%53 = vector.extract %52[0] : vector<1x4xf32>
%54 = vector.fma %53, %46, %arg12 : vector<4xf32>
%55 = vector.broadcast %50 : vector<1xf32> to vector<1x4xf32>
%56 = vector.extract %55[0] : vector<1x4xf32>
%57 = vector.fma %56, %47, %54 : vector<4xf32>
%58 = vector.broadcast %51 : vector<1xf32> to vector<1x4xf32>
%59 = vector.extract %58[0] : vector<1x4xf32>
%60 = vector.fma %59, %48, %57 : vector<4xf32>
%61 = vector.transfer_read %44[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%62 = vector.transfer_read %44[%c0, %c0, %c2, %c1], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%63 = vector.transfer_read %44[%c0, %c0, %c2, %c2], %cst_1 {in_bounds = [true]} : tensor<1x1x3x3xf32>, vector<1xf32>
%64 = vector.broadcast %61 : vector<1xf32> to vector<1x4xf32>
%65 = vector.extract %64[0] : vector<1x4xf32>
%66 = vector.fma %65, %46, %arg11 : vector<4xf32>
%67 = vector.broadcast %62 : vector<1xf32> to vector<1x4xf32>
%68 = vector.extract %67[0] : vector<1x4xf32>
%69 = vector.fma %68, %47, %66 : vector<4xf32>
%70 = vector.broadcast %63 : vector<1xf32> to vector<1x4xf32>
%71 = vector.extract %70[0] : vector<1x4xf32>
%72 = vector.fma %71, %48, %69 : vector<4xf32>
scf.yield %72, %60 : vector<4xf32>, vector<4xf32>
}
scf.yield %43#0, %43#1 : vector<4xf32>, vector<4xf32>
}
%34 = vector.transfer_write %33#1, %31[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%35 = vector.transfer_write %33#0, %34[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%36 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%37 = vector.transfer_read %28[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x2x4xf32>, vector<4xf32>
%38 = arith.subf %33#1, %36 : vector<4xf32>
%39 = arith.subf %33#0, %37 : vector<4xf32>
%40 = vector.transfer_write %38, %35[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%41 = vector.transfer_write %39, %40[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%42 = tensor.insert_slice %41 into %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x2x4xf32> into tensor<1x1x4x32xf32>
scf.yield %42 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 0 : index}
scf.yield %27 : tensor<1x1x4x32xf32>
} {iree.spirv.distribute_dim = 1 : index}
flow.dispatch.tensor.store %24, %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : tensor<1x1x4x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
} else {
%16 = flow.dispatch.tensor.load %3, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%17 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%18 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%9, %arg0)
%19 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%20 = affine.apply affine_map<(d0, d1) -> (d0 - d1 * 2)>(%12, %arg1)
%21 = tensor.extract_slice %4[0, %17, %19, 0] [1, %18, %20, 3] [1, 1, 1, 1] : tensor<1x224x224x3xf32> to tensor<1x?x?x3xf32>
%22 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%23 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 1, 4, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x112x112x32xf32> -> tensor<1x1x4x32xf32>
%24 = affine.min affine_map<(d0, d1) -> (0, d0 - d1 * 2)>(%9, %arg0)
%25 = affine.min affine_map<(d0, d1) -> (3, d0 - d1 * 2)>(%9, %arg0)
%26 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%25, %24)
%27 = scf.for %arg3 = %c0 to %c4 step %c2 iter_args(%arg4 = %16) -> (tensor<1x1x4x32xf32>) {
%28 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2 * 2, d0 * 2)>(%arg3, %12, %arg1)
%29 = affine.min affine_map<(d0, d1, d2) -> (d2 * 2 + 5, d0 - d1 * 2)>(%12, %arg1, %arg3)
%30 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%29, %28)
%31 = tensor.extract_slice %21[0, %24, %28, 0] [1, %26, %30, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%32 = scf.for %arg5 = %c0 to %c32 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x32xf32>) {
%33 = tensor.extract_slice %23[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%34 = tensor.extract_slice %arg6[0, 0, %arg3, %arg5] [1, 1, 2, 4] [1, 1, 1, 1] : tensor<1x1x4x32xf32> to tensor<1x1x2x4xf32>
%35 = vector.transfer_write %cst, %34[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%36 = vector.transfer_write %cst, %35[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
// Slice a 4-wide band of output channels (at offset %arg5) out of the
// 3x3x3x32 filter tensor: [kh, kw, input-channel, output-channel].
%37 = tensor.extract_slice %22[0, 0, 0, %arg5] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
// Loop over the filter height (kh = 0..2). The two vector<4xf32> iter_args
// are running accumulators for two adjacent output pixels (the output tile
// written below is 1x1x2x4).
%38:2 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %cst, %arg9 = %cst) -> (vector<4xf32>, vector<4xf32>) {
  // Clamp the input-row window [%48, %49) against the valid extent
  // (%25 - %24); %50 is the resulting in-bounds row count.
  %48 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg7, %25, %24)
  %49 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 1)>(%arg7, %25, %24)
  %50 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%49, %48)
  %51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %50]
  // Row-in-bounds predicate, materialized twice (once per output pixel);
  // the two compares are identical and presumably folded by later CSE.
  %52 = arith.cmpi sgt, %51, %c0 : index
  %53 = arith.cmpi sgt, %51, %c0 : index
  // Loop over the filter width (kw = 0..2).
  %54:2 = scf.for %arg10 = %c0 to %c3 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (vector<4xf32>, vector<4xf32>) {
    // Same clamping for the input-column window; %57 is the in-bounds
    // column count (window is d0..d0+3 wide before clamping).
    %55 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0)>(%arg10, %29, %28)
    %56 = affine.min affine_map<(d0, d1, d2) -> (d1 - d2, d0 + 3)>(%arg10, %29, %28)
    %57 = affine.apply affine_map<(d0, d1) -> (d0 - d1)>(%56, %55)
    // Dynamically-sized input patch for this (kh, kw) tap: 1 x %50 x %57 x 3.
    %58 = tensor.extract_slice %31[0, %48, %55, 0] [1, %50, %57, 3] [1, 1, 1, 1] : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
    %59 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%c0, %57]
    // Guard for output pixel 0: row in bounds AND at least one column valid.
    %60 = arith.cmpi sgt, %59, %c0 : index
    %61 = arith.andi %52, %60 : i1
    // If in bounds, gather the 3 input channels at spatial column 0 into a
    // vector<3xf32>; otherwise yield %cst_0 (defined earlier; presumably
    // the zero/padding splat — confirm against the surrounding dump).
    %62 = scf.if %61 -> (vector<3xf32>) {
      %94 = vector.transfer_read %58[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
      %95 = vector.insert_strided_slice %94, %cst_0 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
      %96 = vector.transfer_read %58[%c0, %c0, %c0, %c1], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
      %97 = vector.insert_strided_slice %96, %95 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
      %98 = vector.transfer_read %58[%c0, %c0, %c0, %c2], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
      %99 = vector.insert_strided_slice %98, %97 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
      scf.yield %99 : vector<3xf32>
    } else {
      scf.yield %cst_0 : vector<3xf32>
    }
    // Guard for output pixel 1: its input column is spatial offset 2
    // (note %c2 in the reads below), so it needs %59 > 2.
    %63 = arith.cmpi sgt, %59, %c2 : index
    %64 = arith.andi %53, %63 : i1
    // Same 3-channel gather for the second output pixel, at column 2.
    %65 = scf.if %64 -> (vector<3xf32>) {
      %94 = vector.transfer_read %58[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
      %95 = vector.insert_strided_slice %94, %cst_0 {offsets = [0], strides = [1]} : vector<1xf32> into vector<3xf32>
      %96 = vector.transfer_read %58[%c0, %c0, %c2, %c1], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
      %97 = vector.insert_strided_slice %96, %95 {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
      %98 = vector.transfer_read %58[%c0, %c0, %c2, %c2], %cst_1 {in_bounds = [true]} : tensor<1x?x?x3xf32>, vector<1xf32>
      %99 = vector.insert_strided_slice %98, %97 {offsets = [2], strides = [1]} : vector<1xf32> into vector<3xf32>
      scf.yield %99 : vector<3xf32>
    } else {
      scf.yield %cst_0 : vector<3xf32>
    }
    // Unpack each gathered vector<3xf32> into per-input-channel scalars
    // (%66-%68 for pixel 0, %69-%71 for pixel 1).
    %66 = vector.extract_strided_slice %62 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
    %67 = vector.extract_strided_slice %62 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
    %68 = vector.extract_strided_slice %62 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
    %69 = vector.extract_strided_slice %65 {offsets = [0], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
    %70 = vector.extract_strided_slice %65 {offsets = [1], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
    %71 = vector.extract_strided_slice %65 {offsets = [2], sizes = [1], strides = [1]} : vector<3xf32> to vector<1xf32>
    // Filter tap for this (kh, kw): 3 input channels x 4 output channels.
    %72 = tensor.extract_slice %37[%arg7, %arg10, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x4xf32> to tensor<1x1x3x4xf32>
    // One vector<4xf32> of filter weights per input channel.
    %73 = vector.transfer_read %72[%c0, %c0, %c0, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
    %74 = vector.transfer_read %72[%c0, %c0, %c1, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
    %75 = vector.transfer_read %72[%c0, %c0, %c2, %c0], %cst_1 {in_bounds = [true]} : tensor<1x1x3x4xf32>, vector<4xf32>
    // Pixel 0: acc %arg12 += input[c] * filter[c] for c = 0..2, with each
    // input scalar broadcast across the 4 output channels before the fma.
    %76 = vector.broadcast %66 : vector<1xf32> to vector<1x4xf32>
    %77 = vector.extract %76[0] : vector<1x4xf32>
    %78 = vector.fma %77, %73, %arg12 : vector<4xf32>
    %79 = vector.broadcast %67 : vector<1xf32> to vector<1x4xf32>
    %80 = vector.extract %79[0] : vector<1x4xf32>
    %81 = vector.fma %80, %74, %78 : vector<4xf32>
    %82 = vector.broadcast %68 : vector<1xf32> to vector<1x4xf32>
    %83 = vector.extract %82[0] : vector<1x4xf32>
    %84 = vector.fma %83, %75, %81 : vector<4xf32>
    // Pixel 1: same three-fma chain against accumulator %arg11, reusing the
    // same filter vectors %73-%75.
    %85 = vector.broadcast %69 : vector<1xf32> to vector<1x4xf32>
    %86 = vector.extract %85[0] : vector<1x4xf32>
    %87 = vector.fma %86, %73, %arg11 : vector<4xf32>
    %88 = vector.broadcast %70 : vector<1xf32> to vector<1x4xf32>
    %89 = vector.extract %88[0] : vector<1x4xf32>
    %90 = vector.fma %89, %74, %87 : vector<4xf32>
    %91 = vector.broadcast %71 : vector<1xf32> to vector<1x4xf32>
    %92 = vector.extract %91[0] : vector<1x4xf32>
    %93 = vector.fma %92, %75, %90 : vector<4xf32>
    scf.yield %93, %84 : vector<4xf32>, vector<4xf32>
  }
  scf.yield %54#0, %54#1 : vector<4xf32>, vector<4xf32>
}
// Store the two accumulated 4-channel results into the 1x1x2x4 output tile:
// result #1 goes to output column 0, result #0 to output column 1.
%39 = vector.transfer_write %38#1, %36[%c0, %c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>
%40 = vector.transfer_write %38#0, %39[%c0, %c0, %c1, %c0] {in_bounds = [true]} : vector<4xf32>, tensor<1x1x2x4xf32>