Skip to content

Instantly share code, notes, and snippets.

@antiagainst
Created April 2, 2021 00:37
Show Gist options
  • Save antiagainst/eb917325036f40e58b58aa0da9c4e3da to your computer and use it in GitHub Desktop.
Save antiagainst/eb917325036f40e58b58aa0da9c4e3da to your computer and use it in GitHub Desktop.
// *** IR Dump After mlir::iree_compiler::IREE::SIP::MaterializeReflectionAttrsPass ***
// Exported entry point @conv: a single mhlo.convolution of a 1x225x225x3 f32
// input (%arg0) with a 3x3x3x32 f32 kernel (%arg1), returning 1x112x112x32 f32.
// dimension_numbers: input/output use batch = dim 0, feature = dim 3 and
// spatial dims [1, 2]; the kernel uses spatial dims [0, 1], input feature =
// dim 2 and output feature = dim 3.
// window_strides = 2, padding = 0, rhs_dilation = 1, and both group counts = 1.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
// *** IR Dump After mlir::iree_compiler::IREE::Flow::LegalizeInputTypesPass ***
// This dump prints the enclosing module op explicitly. The @conv function
// inside it is textually the same as in the preceding dump: one
// mhlo.convolution with stride 2 and no padding, all types f32.
module {
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
}
// *** IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionDynamicDimsPass ***
// @conv is textually unchanged from the preceding dumps. Every tensor type in
// the signature and body has a fully static shape (no `?` dimensions).
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ConvertHLOToLinalgOnTensorsPass ***
// The mhlo.convolution from the earlier dumps is now expressed with Linalg ops
// on tensors:
//   %0 - linalg.init_tensor creating the 1x112x112x32 result tensor,
//   %1 - linalg.fill of %0 with the 0.0 constant %cst,
//   %2 - linalg.conv_2d_input_nhwc_filter_hwcf taking %arg0/%arg1 as ins and
//        %1 as its outs operand, with strides = 2 and dilations = 1 (the same
//        values the mhlo op carried as window_strides / rhs_dilation).
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%cst = constant 0.000000e+00 : f32
%1 = linalg.fill(%0, %cst) : tensor<1x112x112x32xf32>, f32 -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%1 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
return %2 : tensor<1x112x112x32xf32>
}
// *** IR Dump After LinalgFoldUnitExtentDims ***
// Same three Linalg ops (init_tensor, fill, conv) as in the preceding dump.
// The only textual difference is that the %cst constant is now printed before
// linalg.init_tensor; operand and result types are unchanged.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%cst = constant 0.000000e+00 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%1 = linalg.fill(%0, %cst) : tensor<1x112x112x32xf32>, f32 -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%1 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
return %2 : tensor<1x112x112x32xf32>
}
// *** IR Dump After mlir::iree_compiler::IREE::Flow::(anonymous namespace)::DispatchLinalgOnTensorsPass ***
// @conv now wraps the fill + convolution in a flow.dispatch.workgroups region.
// The region takes the two inputs as readonly !flow.dispatch.tensor values
// (%arg2, %arg3) and the result as a writeonly one (%arg4).
// NOTE(review): the bracketed operands [%c32, %c112, %c112] presumably give the
// per-dimension workload (dim 0 = 32, dims 1 and 2 = 112); they match the upper
// bounds of the three loops inside the region — confirm against the Flow
// dialect docs.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = flow.dispatch.workgroups[%c32, %c112, %c112](%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c32_0 = constant 32 : index
%c112_1 = constant 112 : index
%c0 = constant 0 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
// Full-size result tensor; each loop iteration below takes a subtensor of it.
%1 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
// Three nested loops, one per workgroup dimension. Each loop starts at
// id * size and steps by count * size (both via the s0 * s1 affine map).
// Dimension 2 loop: %arg5 ranges up to 112 (output dim 1).
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %2 to %c112_1 step %3 {
// Dimension 1 loop: %arg6 ranges up to 112 (output dim 2).
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %4 to %c112_1 step %5 {
// Dimension 0 loop: %arg7 ranges up to 32 (output dim 3).
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %6 to %c32_0 step %7 {
// Input tile along dims 1 and 2: offset = 2 * iv,
// size = min(2 * workgroup_size + 1, 225 - 2 * iv).
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%workgroup_size_2, %arg5)
%10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%11 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%workgroup_size_1, %arg6)
%12 = flow.dispatch.tensor.load %arg2, offsets = [%c0, %8, %10, %c0], sizes = [%c1, %9, %11, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
// Filter tile: full 3x3x3 extent, last dim offset %arg7 with
// size = min(workgroup_size_0, 32 - %arg7).
%13 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg7, %workgroup_size_0)
%14 = flow.dispatch.tensor.load %arg3, offsets = [%c0, %c0, %c0, %arg7], sizes = [%c3, %c3, %c3, %13], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
// Output tile sizes: min(workgroup_size, bound - iv) for each tiled dim.
// %17 recomputes the same expression as %13.
%15 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg5, %workgroup_size_2)
%16 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg6, %workgroup_size_1)
%17 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg7, %workgroup_size_0)
// Take the output tile out of %1, zero-fill it, and run the convolution on
// the tiles. The conv is tagged __internal_linalg_transform__ = "workgroup".
%18 = subtensor %1[0, %arg5, %arg6, %arg7] [1, %15, %16, %17] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
%19 = linalg.fill(%18, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%20 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %14 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%19 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
// Back-to-back casts: to a fully dynamic shape and straight back to 1x?x?x?.
%21 = tensor.cast %20 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
%22 = tensor.cast %21 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
// Write the computed tile into the writeonly result at the tile's offsets.
flow.dispatch.tensor.store %22, %arg4, offsets = [%c0, %arg5, %arg6, %arg7], sizes = [%c1, %15, %16, %17], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
return %0 : tensor<1x112x112x32xf32>
}
// *** IR Dump After Canonicalizer ***
// Same dispatch region as in the preceding dump, with these visible changes:
//   - the full-size linalg.init_tensor and the subtensor taken from it are
//     gone; a tile-sized linalg.init_tensor [1, %14, %15, %16] is created
//     inside the innermost loop instead,
//   - the tensor.cast pair is gone, so the conv result %19 is stored directly.
// Loop structure and tile offset/size computations are otherwise the same
// (SSA value numbers have shifted).
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = flow.dispatch.workgroups[%c32, %c112, %c112](%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c32_0 = constant 32 : index
%c112_1 = constant 112 : index
%c0 = constant 0 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
// Each loop: lower bound = id * size, step = count * size.
%1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
%2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg5 = %1 to %c112_1 step %2 {
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg6 = %3 to %c112_1 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg7 = %5 to %c32_0 step %6 {
// Input tile: offset = 2 * iv, size = min(2 * workgroup_size + 1, 225 - 2 * iv).
%7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
%8 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%workgroup_size_2, %arg5)
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
%10 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%workgroup_size_1, %arg6)
%11 = flow.dispatch.tensor.load %arg2, offsets = [%c0, %7, %9, %c0], sizes = [%c1, %8, %10, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
// Filter tile: last-dim size = min(workgroup_size_0, 32 - %arg7).
%12 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg7, %workgroup_size_0)
%13 = flow.dispatch.tensor.load %arg3, offsets = [%c0, %c0, %c0, %arg7], sizes = [%c3, %c3, %c3, %12], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
// Output tile sizes; these also size the per-iteration init_tensor below.
%14 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg5, %workgroup_size_2)
%15 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg6, %workgroup_size_1)
%16 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg7, %workgroup_size_0)
%17 = linalg.init_tensor [1, %14, %15, %16] : tensor<1x?x?x?xf32>
%18 = linalg.fill(%17, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%19 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%18 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %19, %arg4, offsets = [%c0, %arg5, %arg6, %arg7], sizes = [%c1, %14, %15, %16], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
flow.return
}
return %0 : tensor<1x112x112x32xf32>
}
// *** IR Dump After mlir::iree_compiler::IREE::Flow::OutlineDispatchRegions2Pass ***
// Affine maps that were printed inline in the preceding dump now appear as
// module-level aliases:
//   #map0 - product of two symbols (used for id * size and count * size),
//   #map1 - 2 * d0 (input tile offset),
//   #map2 - candidates for the input tile size: 2 * d0 + 1 and 225 - 2 * d1,
//   #map3 - candidates for a tile size bounded by 32: d1 and 32 - d0,
//   #map4 - candidates for a tile size bounded by 112: d1 and 112 - d0.
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0) -> (d0 * 2)>
#map2 = affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>
#map3 = affine_map<(d0, d1) -> (d1, -d0 + 32)>
#map4 = affine_map<(d0, d1) -> (d1, -d0 + 112)>
module {
// The dispatch region body now lives in its own private flow.executable.
// The entry op records the tensor-level signature and workgroup_rank = 3.
flow.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
flow.dispatch.entry @conv_dispatch_0 attributes {signature = (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>, workgroup_rank = 3 : index}
module {
// Outlined function: %arg0/%arg1 are the readonly inputs, %arg2 the writeonly
// result. It returns nothing; the result is written via
// flow.dispatch.tensor.store.
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
// Each loop: lower bound = id * size, step = count * size (via #map0).
%0 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
// Input tile offsets (%6, %8) and sizes (%7, %9).
%6 = affine.apply #map1(%arg3)
%7 = affine.min #map2(%workgroup_size_2, %arg3)
%8 = affine.apply #map1(%arg4)
%9 = affine.min #map2(%workgroup_size_1, %arg4)
%10 = flow.dispatch.tensor.load %arg0, offsets = [%c0, %6, %8, %c0], sizes = [%c1, %7, %9, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
// Filter tile along the last dimension.
%11 = affine.min #map3(%arg5, %workgroup_size_0)
%12 = flow.dispatch.tensor.load %arg1, offsets = [%c0, %c0, %c0, %arg5], sizes = [%c3, %c3, %c3, %11], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
// Output tile sizes, then init / zero-fill / conv / store for the tile.
%13 = affine.min #map4(%arg3, %workgroup_size_2)
%14 = affine.min #map4(%arg4, %workgroup_size_1)
%15 = affine.min #map3(%arg5, %workgroup_size_0)
%16 = linalg.init_tensor [1, %13, %14, %15] : tensor<1x?x?x?xf32>
%17 = linalg.fill(%16, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%18 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%17 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %18, %arg2, offsets = [%c0, %arg3, %arg4, %arg5], sizes = [%c1, %13, %14, %15], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
// @conv is reduced to a single flow.dispatch that references the outlined
// entry point by symbol and passes the same [%c32, %c112, %c112] operands.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
}
// *** IR Dump After mlir::iree_compiler::IREE::Flow::(anonymous namespace)::HoistUnstreamableOps ***
// @conv still consists of two index constants and one flow.dispatch.
// The only textual difference from @conv in the preceding dump is that %c112
// is now printed before %c32; the dispatch operands are unchanged.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
return %0 : tensor<1x112x112x32xf32>
}
// *** IR Dump After mlir::iree_compiler::IREE::Flow::FormStreamsPass ***
// The flow.dispatch now sits inside a flow.ex.stream.fragment region.
// The fragment captures four operands: the two index constants (%c32, %c112)
// and the two input tensors, which appear inside as %arg2..%arg5.
// %arg3 (bound to %c112) is used for two of the three bracketed dispatch
// operands, so the dispatch still receives 32, 112, 112.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c112 = constant 112 : index
%c32 = constant 32 : index
%0 = flow.ex.stream.fragment(%c32, %c112, %arg0, %arg1) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%1 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %1 : tensor<1x112x112x32xf32>
}
return %0 : tensor<1x112x112x32xf32>
}
// *** IR Dump After Canonicalizer ***
// Compared with the preceding dump, the two index constants are now defined
// inside the stream fragment instead of being passed in, so the fragment
// captures only the two input tensors (%arg2, %arg3).
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%0 = flow.ex.stream.fragment(%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%1 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %1 : tensor<1x112x112x32xf32>
}
return %0 : tensor<1x112x112x32xf32>
}
// *** IR Dump After Canonicalizer ***
// Whole-module view (executable plus @conv) after canonicalization.
// #map2..#map4 now take one dim (d0, the loop induction variable) and one
// symbol (s0, the workgroup size) instead of two dims; the affine.min uses
// below are written as #mapN(%iv)[%workgroup_size_K] accordingly.
//   #map2 - candidates 2 * s0 + 1 and 225 - 2 * d0 (input tile size),
//   #map3 - candidates s0 and 32 - d0,
//   #map4 - candidates s0 and 112 - d0.
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0) -> (d0 * 2)>
#map2 = affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>
#map3 = affine_map<(d0)[s0] -> (s0, -d0 + 32)>
#map4 = affine_map<(d0)[s0] -> (s0, -d0 + 112)>
module {
flow.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
flow.dispatch.entry @conv_dispatch_0 attributes {signature = (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>, workgroup_rank = 3 : index}
module {
// Tiled convolution body: three nested loops over the 112 x 112 x 32 output
// extents, each iteration loading an input tile and a filter tile, computing
// a zero-initialized output tile, and storing it into %arg2.
func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
%cst = constant 0.000000e+00 : f32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c0 = constant 0 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
%workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
%workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
%workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
%workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
%workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
%workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
%workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
%workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
// Each loop: lower bound = id * size, step = count * size (via #map0).
%0 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
%1 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
scf.for %arg3 = %0 to %c112 step %1 {
%2 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
%3 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
scf.for %arg4 = %2 to %c112 step %3 {
%4 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
%5 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
scf.for %arg5 = %4 to %c32 step %5 {
// Input tile offsets (%6, %8) and sizes (%7, %9).
%6 = affine.apply #map1(%arg3)
%7 = affine.min #map2(%arg3)[%workgroup_size_2]
%8 = affine.apply #map1(%arg4)
%9 = affine.min #map2(%arg4)[%workgroup_size_1]
%10 = flow.dispatch.tensor.load %arg0, offsets = [%c0, %6, %8, %c0], sizes = [%c1, %7, %9, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
// Filter tile along the last dimension.
%11 = affine.min #map3(%arg5)[%workgroup_size_0]
%12 = flow.dispatch.tensor.load %arg1, offsets = [%c0, %c0, %c0, %arg5], sizes = [%c3, %c3, %c3, %11], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
// Output tile sizes; %15 recomputes the same expression as %11.
%13 = affine.min #map4(%arg3)[%workgroup_size_2]
%14 = affine.min #map4(%arg4)[%workgroup_size_1]
%15 = affine.min #map3(%arg5)[%workgroup_size_0]
%16 = linalg.init_tensor [1, %13, %14, %15] : tensor<1x?x?x?xf32>
%17 = linalg.fill(%16, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%18 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%17 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %18, %arg2, offsets = [%c0, %arg3, %arg4, %arg5], sizes = [%c1, %13, %14, %15], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
}
}
// Host-side function: a stream fragment holding the two index constants and
// the single flow.dispatch to @conv_dispatch_0.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%0 = flow.ex.stream.fragment(%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%1 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %1 : tensor<1x112x112x32xf32>
}
return %0 : tensor<1x112x112x32xf32>
}
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass ***
// The executable is now a hal.executable. Visible changes from the preceding
// dump:
//   - a hal.interface @legacy_io declares three StorageBuffer bindings in
//     set 0 (bindings 0 and 1 with access "Read", binding 2 with
//     "Write|Discard"),
//   - a hal.executable.target @vulkan_spirv (filter "vulkan*") wraps the inner
//     module, which carries an spv.target_env attribute,
//   - @conv_dispatch_0 takes no arguments; it obtains its three
//     !flow.dispatch.tensor values from hal.interface.binding.subspan ops,
//   - workgroup size/id/count come from hal.interface.workgroup.* ops and are
//     named x/y/z for dimensions 0/1/2.
// The affine map aliases are the same as in the preceding dump.
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0) -> (d0 * 2)>
#map2 = affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>
#map3 = affine_map<(d0)[s0] -> (s0, -d0 + 32)>
#map4 = affine_map<(d0)[s0] -> (s0, -d0 + 112)>
module {
hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
hal.interface @legacy_io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.target @vulkan_spirv, filter="vulkan*" {
// Entry point metadata: refers to @legacy_io, ordinal 0, and records the
// dispatch-tensor signature the function had before its arguments were removed.
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()}
// Target environment as printed: SPIR-V v1.3, ARM:IntegratedGPU,
// subgroup_size = 16, max_compute_workgroup_invocations = 512,
// max_compute_shared_memory_size = 32768.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
func @conv_dispatch_0() {
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
// %0 = input, %1 = filter, %2 = output, each taken from its binding at offset %c0.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// z loop over output dim 1 (bound 112), y loop over output dim 2 (bound 112),
// x loop over output dim 3 (bound 32); each starts at id * size and steps by
// count * size.
%3 = affine.apply #map0()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply #map0()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
// Input tile offsets (%9, %11) and sizes (%10, %12).
%9 = affine.apply #map1(%arg0)
%10 = affine.min #map2(%arg0)[%workgroup_size_z]
%11 = affine.apply #map1(%arg1)
%12 = affine.min #map2(%arg1)[%workgroup_size_y]
%13 = flow.dispatch.tensor.load %0, offsets = [%c0, %9, %11, %c0], sizes = [%c1, %10, %12, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
// Filter tile along the last dimension.
%14 = affine.min #map3(%arg2)[%workgroup_size_x]
%15 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0, %c0, %arg2], sizes = [%c3, %c3, %c3, %14], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
// Output tile sizes, then init / zero-fill / conv / store for the tile.
%16 = affine.min #map4(%arg0)[%workgroup_size_z]
%17 = affine.min #map4(%arg1)[%workgroup_size_y]
%18 = affine.min #map3(%arg2)[%workgroup_size_x]
%19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
%20 = linalg.fill(%19, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
%21 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %21, %2, offsets = [%c0, %arg0, %arg1, %arg2], sizes = [%c1, %16, %17, %18], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
// A second, private copy of the @legacy_io interface inside the inner module,
// listing the same three bindings as the one on the hal.executable.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
// Host-side @conv is textually the same as in the preceding dump.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%0 = flow.ex.stream.fragment(%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
(%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
%c112 = constant 112 : index
%c32 = constant 32 : index
%1 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
flow.return %1 : tensor<1x112x112x32xf32>
}
return %0 : tensor<1x112x112x32xf32>
}
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::LinalgBufferizePass ***
// Dispatch function after bufferization. This dump holds the memref and
// tensor forms side by side:
//   - each binding is materialized twice, once as a memref (%0, %2, %4) and
//     once as a !flow.dispatch.tensor (%1, %3, %5),
//   - each flow.dispatch.tensor.load has a memref.subview with the same
//     offsets/sizes/strides next to it,
//   - linalg.fill and the convolution each appear in a memref form (no result,
//     operating on the subviews) and a tensor form (%26, %27).
// The memref convolution uses %24, a subview of the output binding %4, as its
// outs operand. There is no flow.dispatch.tensor.store in this function, and
// %5 and %27 have no uses here.
// NOTE(review): the tensor-form ops look like leftovers awaiting cleanup by a
// later pass — confirm against the following dump.
func @conv_dispatch_0() {
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%c32 = constant 32 : index
%c112 = constant 112 : index
%c1 = constant 1 : index
%c3 = constant 3 : index
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Affine maps are printed inline in this function-level dump.
// Each loop: lower bound = id * size, step = count * size.
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
// Input tile: offset = 2 * iv, size = min(2 * workgroup_size + 1, 225 - 2 * iv).
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg1)[%workgroup_size_y]
// Memref view of the input tile. The result type is fully dynamic, with a
// layout map of one offset symbol (s0) and one stride symbol per dim (s1..s4).
%16 = memref.subview %0[%c0, %12, %14, %c0] [%c1, %13, %15, %c3] [%c1, %c1, %c1, %c1] : memref<1x225x225x3xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
%17 = flow.dispatch.tensor.load %1, offsets = [%c0, %12, %14, %c0], sizes = [%c1, %13, %15, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
// Filter tile: memref subview (%19) and tensor load (%20) of the same region.
%18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%19 = memref.subview %2[%c0, %c0, %c0, %arg2] [%c3, %c3, %c3, %18] [%c1, %c1, %c1, %c1] : memref<3x3x3x32xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
%20 = flow.dispatch.tensor.load %3, offsets = [%c0, %c0, %c0, %arg2], sizes = [%c3, %c3, %c3, %18], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
// Output tile sizes, then the output tile as a subview of the output memref %4.
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%22 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%23 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%24 = memref.subview %4[%c0, %arg0, %arg1, %arg2] [%c1, %21, %22, %23] [%c1, %c1, %c1, %c1] : memref<1x112x112x32xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
%25 = linalg.init_tensor [1, %21, %22, %23] : tensor<1x?x?x?xf32>
// Zero-fill: memref form on %24, followed by the tensor form on %25.
linalg.fill(%24, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, f32
%26 = linalg.fill(%25, %cst) {__internal_linalg_transform__ = "workgroup"} : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
// Convolution: memref form (ins %16, %19; outs %24), followed by the tensor form.
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %19 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>) outs(%24 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>)
%27 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%17, %20 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%26 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
}
}
}
return
}
// *** IR Dump After Canonicalizer ***
// Dispatch function for a 2-D convolution (linalg.conv_2d_input_nhwc_filter_hwcf, strides = 2,
// dilations = 1) over f32 buffers: input 1x225x225x3, filter 3x3x3x32, output 1x112x112x32.
// Three nested scf.for loops walk output tiles: the z workgroup id/size drive output dim 1,
// y drives output dim 2, and x drives output dim 3 (loop bounds 112, 112, 32).
// Workgroup sizes are still symbolic values here, so every tile extent is an affine.min.
func @conv_dispatch_0() {
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%c32 = constant 32 : index
%c112 = constant 112 : index
// Each interface binding is materialized twice: once as a memref and once as a
// !flow.dispatch.tensor. The tensor-typed values (%1, %3, %5) have no uses in this function.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop lower bound = id * size, step = count * size, for each of z, y, x.
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
// Input tile: offset is 2 * output index in dims 1 and 2; extent is
// min(2 * tile + 1, 225 - 2 * offset) so the window is clamped to the 225-wide input.
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
// Filter tile: full 3x3x3 extent, sliced only along the last (size-32) dimension.
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Output tile extents. %21 applies the same map to the same operands as %17.
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// The output tile is filled with 0.0 (%cst) and then passed as the outs operand of the convolution.
linalg.fill(%22, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// *** IR Dump After CSE ***
// Dispatch function for a stride-2 linalg.conv_2d_input_nhwc_filter_hwcf over f32 buffers:
// input 1x225x225x3, filter 3x3x3x32, output 1x112x112x32.
// Output tiles are walked by three nested scf.for loops (z -> output dim 1, y -> output dim 2,
// x -> output dim 3). In this snapshot the last-dimension tile extent is computed once (%17)
// and used by both the filter subview (%18) and the output subview (%21).
func @conv_dispatch_0() {
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%c32 = constant 32 : index
%c112 = constant 112 : index
// Bindings appear in memref and !flow.dispatch.tensor form; the tensor-typed
// values (%1, %3, %5) are not used anywhere in this function.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
%2 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%3 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
%4 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%5 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Per-dimension loop start (id * size) and step (count * size).
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %6 to %c112 step %7 {
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %8 to %c112 step %9 {
%10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %10 to %c32 step %11 {
// Input window: starts at 2 * output index, spans min(2 * tile + 1, 225 - 2 * index).
%12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg0)[%workgroup_size_z]
%14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg1)[%workgroup_size_y]
%16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
// Last-dimension tile extent, clamped to the 32 available entries.
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%21 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %17] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Fill the output tile with 0.0, then run the convolution with that tile as outs.
linalg.fill(%21, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%21 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::BufferAllocViewCleanUpPass ***
// Dispatch function for a stride-2 linalg.conv_2d_input_nhwc_filter_hwcf over f32 buffers:
// input 1x225x225x3 (%0), filter 3x3x3x32 (%1), output 1x112x112x32 (%2).
// Only memref-typed interface bindings are present in this snapshot.
// Workgroup sizes/counts are still symbolic, so tiles are distributed with scf.for loops
// (z -> output dim 1, y -> output dim 2, x -> output dim 3) and clamped with affine.min.
func @conv_dispatch_0() {
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_size_z = hal.interface.workgroup.size[2] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop start = id * size; loop step = count * size.
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg2 = %7 to %c32 step %8 {
// Input tile: offset 2 * output index, extent min(2 * tile + 1, 225 - 2 * index) in dims 1 and 2.
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg0)[%workgroup_size_z]
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg1)[%workgroup_size_y]
%13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
// %14 is the last-dimension tile extent shared by the filter (%15) and output (%18) subviews.
%14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
%15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
%17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
%18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Output tile is filled with 0.0 before being used as the convolution's outs operand.
linalg.fill(%18, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
Root op: linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, is_root_op, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
Queried workload size: 32, 112, 112, 1
Queried tile size: 16, 4, 4, 0
--- After concretizing hal.interface.workgroup ops ---
// Dispatch function for a stride-2 linalg.conv_2d_input_nhwc_filter_hwcf (input 1x225x225x3,
// filter 3x3x3x32, output 1x112x112x32, f32) with workgroup tile sizes now given as constants:
// %c4 multiplies the z and y workgroup ids, %c16 multiplies the x workgroup id.
// Each scf.for uses a step equal to its upper bound (112, 112, 32), so each body runs at most
// once per workgroup (assuming non-negative workgroup ids).
func @conv_dispatch_0() {
%c4 = constant 4 : index
%c16 = constant 16 : index
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%c32 = constant 32 : index
%c112 = constant 112 : index
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %c4]
scf.for %arg0 = %3 to %c112 step %c112 {
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c4]
scf.for %arg1 = %4 to %c112 step %c112 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c16]
scf.for %arg2 = %5 to %c32 step %c32 {
// The affine.min maps below carry literal first results (9, 16, 4); their symbol
// operand s0 is still passed but no longer appears in any map result.
// NOTE(review): 9 presumably corresponds to 2 * 4 + 1 (input rows for 4 outputs at stride 2) - confirm.
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%7 = affine.min affine_map<(d0)[s0] -> (9, d0 * -2 + 225)>(%arg0)[%c4]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%9 = affine.min affine_map<(d0)[s0] -> (9, d0 * -2 + 225)>(%arg1)[%c4]
%10 = memref.subview %0[0, %6, %8, 0] [1, %7, %9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%11 = affine.min affine_map<(d0)[s0] -> (16, -d0 + 32)>(%arg2)[%c16]
%12 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %11] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%13 = affine.min affine_map<(d0)[s0] -> (4, -d0 + 112)>(%arg0)[%c4]
%14 = affine.min affine_map<(d0)[s0] -> (4, -d0 + 112)>(%arg1)[%c4]
%15 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %13, %14, %11] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%15, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
// The convolution now records the untiled operand types in the
// iree.codegen.original_input_types / original_output_types attributes and is tagged is_root_op.
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%15 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
}
return
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ConcretizeTileAmongWorkgroupsPass ***
// Executable target for Vulkan/SPIR-V holding: the entry point declaration, a module with the
// conv_dispatch_0 function, and the @legacy_io interface with its three storage-buffer bindings.
hal.executable.target @vulkan_spirv, filter="vulkan*" {
// The entry point's region returns three index constants (2, 28, 28).
// NOTE(review): these match 32/16, 112/4 and 112/4, so they are presumably the x/y/z workgroup counts - confirm.
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c2 = constant 2 : index
%c28 = constant 28 : index
%c28_0 = constant 28 : index
hal.return %c2, %c28, %c28_0 : index, index, index
}
// spv.target_env below describes the SPIR-V 1.3 target: an ARM integrated GPU with subgroup_size = 16.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// The function carries a local_size of [4, 4, 1] in its spv.entry_point_abi attribute.
// There are no scf.for loops here: tile offsets %3, %4, %5 (workgroup id * 4, * 4, * 16)
// are used directly in the subviews.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%c4 = constant 4 : index
%c16 = constant 16 : index
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %c4]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c4]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c16]
// Input tile: offset 2 * output offset, extent min(9, 225 - 2 * offset) in dims 1 and 2.
%6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%3)
%7 = affine.min affine_map<(d0)[s0] -> (9, d0 * -2 + 225)>(%3)[%c4]
%8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%4)
%9 = affine.min affine_map<(d0)[s0] -> (9, d0 * -2 + 225)>(%4)[%c4]
%10 = memref.subview %0[0, %6, %8, 0] [1, %7, %9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%11 = affine.min affine_map<(d0)[s0] -> (16, -d0 + 32)>(%5)[%c16]
%12 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, %11] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%13 = affine.min affine_map<(d0)[s0] -> (4, -d0 + 112)>(%3)[%c4]
%14 = affine.min affine_map<(d0)[s0] -> (4, -d0 + 112)>(%4)[%c4]
%15 = memref.subview %2[0, %3, %4, %5] [1, %13, %14, %11] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Fill the output tile with 0.0, then convolve into it.
linalg.fill(%15, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%15 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
return
}
// Binding table: two read-only storage buffers (@ro0, @ro1) and one write/discard storage buffer (@wo2), all in set 0.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
--- IREE Linalg tile and fuse configuration ---
@func conv_dispatch_0: # workgroup sizes: [4, 4, 1]
linalg.fill : {0 : [0, 4, 4, 16], 1 : [], 2 : [0, 4, 1, 4], 3 : [0, 0, 0, 0, 1, 1, 4]}
linalg.conv_2d_input_nhwc_filter_hwcf : {0 : [0, 4, 4, 16], 1 : [], 2 : [0, 4, 1, 4], 3 : [0, 0, 0, 0, 1, 1, 4]}
--- After Second level Tiling ---
// Workgroup-level form of the stride-2 convolution dispatch (local_size = [4, 4, 1]).
// The filter and output subviews have fully static shapes (3x3x3x16 and 1x4x4x16); the input
// subview still has dynamic extents in dims 1 and 2, computed by affine.min.
// Offsets are affine functions of the workgroup ids only: id * 4 (output dims 1 and 2),
// id * 16 (output/filter last dim), id * 8 (input dims 1 and 2).
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
// Input tile: offset id * 8, extent min(9, 225 - 8 * id) in dims 1 and 2.
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.min affine_map<()[s0] -> (9, s0 * -8 + 225)>()[%workgroup_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%9 = affine.min affine_map<()[s0] -> (9, s0 * -8 + 225)>()[%workgroup_id_y]
%10 = memref.subview %0[0, %6, %8, 0] [1, %7, %9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%11 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%12 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Both ops carry the same launch_info_key ("__op_num_0__").
linalg.fill(%12, %cst) {__internal_linalg_transform__ = "workgroup", launch_info_key = "__op_num_0__"} : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, launch_info_key = "__op_num_0__", strides = dense<2> : tensor<2xi64>} ins(%10, %11 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%12 : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
return
}
--- After Third level Tiling ---
// Per-thread form of the stride-2 convolution dispatch (local_size = [4, 4, 1]).
// Workgroup tiles (%10 input, %11 filter, %12 output) are further sliced using gpu.thread_id:
// each thread addresses a 1x4x1x4 slice of the 1x4x4x16 output tile, at offsets
// (thread z * 4, thread y, thread x * 4) in dims 1..3.
// The fill is tagged "vectorize"; the convolution is tagged "tile_conv_filter".
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-level tile offsets and (for the input) clamped extents.
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.min affine_map<()[s0] -> (9, s0 * -8 + 225)>()[%workgroup_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%9 = affine.min affine_map<()[s0] -> (9, s0 * -8 + 225)>()[%workgroup_id_y]
%10 = memref.subview %0[0, %6, %8, 0] [1, %7, %9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%11 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%12 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Thread slice for the fill: 1x4x1x4 of the output tile.
%13 = "gpu.thread_id"() {dimension = "x"} : () -> index
%14 = "gpu.thread_id"() {dimension = "y"} : () -> index
%15 = "gpu.thread_id"() {dimension = "z"} : () -> index
%16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%15]
%17 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%18 = memref.subview %12[0, %16, %14, %17] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%18, %cst) {__internal_linalg_transform__ = "vectorize", launch_info_key = "__op_num_0__"} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
// The thread ids are queried a second time for the convolution's slices
// (%19..%21 are the same x/y/z queries as %13..%15).
%19 = "gpu.thread_id"() {dimension = "x"} : () -> index
%20 = "gpu.thread_id"() {dimension = "y"} : () -> index
%21 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Input slice: offsets (z * 8, y * 2); extents min(9, %7 - 8 * z) and min(3, %9 - 2 * y),
// i.e. clamped to what remains of the workgroup's input tile.
%22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%21]
%23 = affine.min affine_map<()[s0, s1] -> (9, s0 * -8 + s1)>()[%21, %7]
%24 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%20]
%25 = affine.min affine_map<()[s0, s1] -> (3, s0 * -2 + s1)>()[%20, %9]
%26 = memref.subview %10[0, %22, %24, 0] [1, %23, %25, 3] [1, 1, 1, 1] : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
// Filter slice: 4 entries of the last dimension starting at thread x * 4.
%27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
%28 = memref.subview %11[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Output slice for the convolution: same offsets and shape as the fill's slice %18.
%29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
%31 = memref.subview %12[0, %29, %20, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "tile_conv_filter", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, launch_info_key = "__op_num_0__", strides = dense<2> : tensor<2xi64>} ins(%26, %28 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%31 : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
return
}
--- After tiling convolution filter ---
// Per-thread form of the stride-2 convolution dispatch with the filter window unrolled into loops.
// All subviews have static shapes here (input workgroup tile 1x9x9x3, thread slice 1x9x3x3).
// Two scf.for loops (0 to 3, step 1) step through the filter's dims 0 and 1; each iteration
// runs a convolution on a 1x7x1x3 input slice and a 1x1x3x4 filter slice, always writing to the
// same 1x4x1x4 output slice %27. The inner convolution is tagged "vectorize".
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%cst = constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-level tiles: output offsets id * 4 / id * 16, input offsets id * 8.
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Thread slice of the output tile, filled with 0.0.
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
linalg.fill(%16, %cst) {__internal_linalg_transform__ = "vectorize", launch_info_key = "__op_num_0__"} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
// Thread ids are queried again for the convolution's slices.
%17 = "gpu.thread_id"() {dimension = "x"} : () -> index
%18 = "gpu.thread_id"() {dimension = "y"} : () -> index
%19 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Thread input slice: 9 rows from z * 8 and 3 columns from y * 2 within the 1x9x9x3 tile.
%20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%19]
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
%22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%23 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%25 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// %arg0 / %arg1 select one filter position (dims 0 and 1 of the 3x3x3x4 filter slice) and
// shift the input slice by the same amounts in dims 1 and 2.
scf.for %arg0 = %c0 to %c3 step %c1 {
scf.for %arg1 = %c0 to %c3 step %c1 {
%28 = memref.subview %22[0, %arg0, %arg1, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%29 = memref.subview %24[%arg0, %arg1, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "vectorize", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, launch_info_key = "__op_num_0__", strides = dense<2> : tensor<2xi64>} ins(%28, %29 : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%27 : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
}
}
return
}
--- After Vectorization ---
// conv_dispatch_0 as printed after the vectorization step: the per-thread
// linalg.conv_2d_input_nhwc_filter_hwcf op has been replaced by vector
// transfer_read / contract / transfer_write ops. The entry point declares a
// local size of [4, 4, 1].
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
// Buffer bindings: ro0 is the 1x225x225x3 input, ro1 the 3x3x3x32 filter,
// wo2 the 1x112x112x32 output.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-level tile offsets: output offsets are id_z * 4, id_y * 4 and
// id_x * 16; the matching input offsets are id_z * 8 and id_y * 8.
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
// Workgroup tiles: 1x9x9x3 of the input, 3x3x3x16 of the filter, 1x4x4x16 of
// the output.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Per-thread 1x4x1x4 output tile at (tid.z * 4, tid.y, tid.x * 4), filled
// with zeros by a single 4-D transfer_write of %cst_0.
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst_0, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x4x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// The thread ids are queried a second time (%17-%19 repeat %11-%13) to build
// the per-thread views used by the convolution: a 1x9x3x3 input window at
// (tid.z * 8, tid.y * 2), a 3x3x3x4 filter slice at channel offset tid.x * 4,
// and %27, which is taken with the same offsets and sizes as %16.
%17 = "gpu.thread_id"() {dimension = "x"} : () -> index
%18 = "gpu.thread_id"() {dimension = "y"} : () -> index
%19 = "gpu.thread_id"() {dimension = "z"} : () -> index
%20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%19]
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
%22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%23 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%25 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Two nested loops, each running 0..3 with step 1; %arg0 and %arg1 index the
// first two dimensions of the filter slice and offset the input window.
scf.for %arg0 = %c0 to %c3 step %c1 {
scf.for %arg1 = %c0 to %c3 step %c1 {
%28 = memref.subview %22[0, %arg0, %arg1, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%29 = memref.subview %24[%arg0, %arg1, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// The 3x4 filter slice is read once and split into three 1x4 rows
// (%31, %32, %33), which all four output-row updates below reuse.
%30 = vector.transfer_read %29[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<3x4xf32>
%31 = vector.extract_strided_slice %30 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%32 = vector.extract_strided_slice %30 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
%33 = vector.extract_strided_slice %30 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
// Output row 0: read input row 0 (1x3) and the current output row (1x4),
// chain three 1x1 by 1x4 contractions (one per input element), write back.
%34 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%35 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.extract_strided_slice %34 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%37 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %36, %31, %35 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%38 = vector.extract_strided_slice %34 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%39 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %38, %32, %37 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%40 = vector.extract_strided_slice %34 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %33, %39 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
vector.transfer_write %41, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Output row 1 pairs with input row 2 (%c2).
%42 = vector.transfer_read %28[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%43 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%44 = vector.extract_strided_slice %42 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %31, %43 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%46 = vector.extract_strided_slice %42 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %32, %45 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%48 = vector.extract_strided_slice %42 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %33, %47 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
vector.transfer_write %49, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Output row 2 pairs with input row 4 (%c4).
%50 = vector.transfer_read %28[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%51 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%52 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%53 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %31, %51 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%54 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%55 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %54, %32, %53 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%56 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %56, %33, %55 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
vector.transfer_write %57, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Output row 3 pairs with input row 6 (%c6).
%58 = vector.transfer_read %28[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%59 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%60 = vector.extract_strided_slice %58 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %31, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%62 = vector.extract_strided_slice %58 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %32, %61 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%64 = vector.extract_strided_slice %58 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %64, %33, %63 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
vector.transfer_write %65, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
}
return
}
--- After Vector Unroll ---
// conv_dispatch_0 as printed after the vector unroll step. In this dump the
// zero-initialization is four 1x1x1x4 transfer_writes of slices of %cst_0, and
// the filter slice is loaded with three separate 1x4 transfer_reads; the loop
// body still reads and writes the output memref on every iteration.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
// Buffer bindings: ro0 is the 1x225x225x3 input, ro1 the 3x3x3x32 filter,
// wo2 the 1x112x112x32 output.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-level tile offsets: output offsets are id_z * 4, id_y * 4 and
// id_x * 16; the matching input offsets are id_z * 8 and id_y * 8.
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
// Workgroup tiles: 1x9x9x3 of the input, 3x3x3x16 of the filter, 1x4x4x16 of
// the output.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Per-thread 1x4x1x4 output tile at (tid.z * 4, tid.y, tid.x * 4).
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Zero-fill of the output tile: the 1x4x1x4 zero constant is sliced into four
// 1x1x1x4 pieces, each written at second-dimension index 0, 1, 2 and 3.
%17 = vector.extract_strided_slice %cst_0 {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%18 = vector.extract_strided_slice %cst_0 {offsets = [0, 1, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%19 = vector.extract_strided_slice %cst_0 {offsets = [0, 2, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%20 = vector.extract_strided_slice %cst_0 {offsets = [0, 3, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
vector.transfer_write %17, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %18, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %19, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %20, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// The thread ids are queried a second time (%21-%23 repeat %11-%13) to build
// the per-thread views used by the convolution: a 1x9x3x3 input window at
// (tid.z * 8, tid.y * 2), a 3x3x3x4 filter slice at channel offset tid.x * 4,
// and %31, which is taken with the same offsets and sizes as %16.
%21 = "gpu.thread_id"() {dimension = "x"} : () -> index
%22 = "gpu.thread_id"() {dimension = "y"} : () -> index
%23 = "gpu.thread_id"() {dimension = "z"} : () -> index
%24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%23]
%25 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%22]
%26 = memref.subview %8[0, %24, %25, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%28 = memref.subview %9[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%23]
%30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%31 = memref.subview %10[0, %29, %22, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Two nested loops, each running 0..3 with step 1; %arg0 and %arg1 index the
// first two dimensions of the filter slice and offset the input window.
scf.for %arg0 = %c0 to %c3 step %c1 {
scf.for %arg1 = %c0 to %c3 step %c1 {
%32 = memref.subview %26[0, %arg0, %arg1, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%33 = memref.subview %28[%arg0, %arg1, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Three 1x4 filter rows, read at third-dimension index 0, 1 and 2; all four
// output-row updates below reuse %34, %35 and %36.
%34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
// Output row 0: read input row 0 (1x3) and the current output row (1x4),
// chain three 1x1 by 1x4 contractions (one per input element), write back.
%37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%38 = vector.transfer_read %31[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%39 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %34, %38 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%41 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%42 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %35, %40 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%43 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %43, %36, %42 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
vector.transfer_write %44, %31[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Output row 1 pairs with input row 2 (%c2).
%45 = vector.transfer_read %32[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%46 = vector.transfer_read %31[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%47 = vector.extract_strided_slice %45 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %34, %46 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%49 = vector.extract_strided_slice %45 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %35, %48 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%51 = vector.extract_strided_slice %45 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %36, %50 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
vector.transfer_write %52, %31[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Output row 2 pairs with input row 4 (%c4).
%53 = vector.transfer_read %32[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%54 = vector.transfer_read %31[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%55 = vector.extract_strided_slice %53 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %34, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%57 = vector.extract_strided_slice %53 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%58 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %35, %56 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%59 = vector.extract_strided_slice %53 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%60 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %36, %58 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
vector.transfer_write %60, %31[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Output row 3 pairs with input row 6 (%c6).
%61 = vector.transfer_read %32[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%62 = vector.transfer_read %31[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%63 = vector.extract_strided_slice %61 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%64 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %63, %34, %62 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%65 = vector.extract_strided_slice %61 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %35, %64 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%67 = vector.extract_strided_slice %61 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %67, %36, %66 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
vector.transfer_write %68, %31[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
}
}
return
}
--- After Hoisting ---
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%17 = vector.extract_strided_slice %cst_0 {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%18 = vector.extract_strided_slice %cst_0 {offsets = [0, 1, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%19 = vector.extract_strided_slice %cst_0 {offsets = [0, 2, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%20 = vector.extract_strided_slice %cst_0 {offsets = [0, 3, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
vector.transfer_write %17, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %18, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %19, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %20, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%21 = "gpu.thread_id"() {dimension = "x"} : () -> index
%22 = "gpu.thread_id"() {dimension = "y"} : () -> index
%23 = "gpu.thread_id"() {dimension = "z"} : () -> index
%24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%23]
%25 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%22]
%26 = memref.subview %8[0, %24, %25, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%28 = memref.subview %9[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%23]
%30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%31 = memref.subview %10[0, %29, %22, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%32 = vector.transfer_read %31[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%33 = vector.transfer_read %31[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%34 = vector.transfer_read %31[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %31[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%36:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %32, %arg2 = %33, %arg3 = %34, %arg4 = %35) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%37:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%38 = memref.subview %26[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%39 = memref.subview %28[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%40 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%41 = vector.transfer_read %39[%c0, %c0, %c1, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%42 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%43 = vector.transfer_read %38[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%44 = vector.extract_strided_slice %43 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %40, %arg6 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%46 = vector.extract_strided_slice %43 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %41, %45 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%48 = vector.extract_strided_slice %43 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %42, %47 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%50 = vector.transfer_read %38[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%51 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %40, %arg7 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%53 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %53, %41, %52 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%55 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %42, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%57 = vector.transfer_read %38[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%58 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %40, %arg8 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%60 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %41, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%62 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %42, %61 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%64 = vector.transfer_read %38[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%65 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %40, %arg9 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%67 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %67, %41, %66 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%69 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%70 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %69, %42, %68 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
scf.yield %49, %56, %63, %70 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %37#0, %37#1, %37#2, %37#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %36#3, %31[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#2, %31[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#1, %31[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#0, %31[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
--- After generalization ---
// Dispatch function @conv_dispatch_0 as printed after the "generalization" step.
// It reads a 1x225x225x3 buffer (ro0) and a 3x3x3x32 buffer (ro1) and writes a
// 1x112x112x32 buffer (wo2). The entry-point ABI attribute sets
// local_size = [4, 4, 1]. Each invocation zero-fills a 1x4x1x4 tile of the
// output, accumulates into it inside a 3x3 loop nest, and stores it back.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
// Index constants, the f32 zero used as transfer_read padding value, and the
// all-zero 1x4x1x4 vector used to initialise the output tile.
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
// Buffer bindings: %0 and %1 come from the @ro0/@ro1 bindings, %2 from @wo2.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-level offsets: output tile origin is (z*4, y*4, x*16); the input
// tile origin is (z*8, y*8), i.e. twice the output spatial offsets.
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
// Per-workgroup views: 1x9x9x3 of %0, 3x3x3x16 of %1, 1x4x4x16 of %2.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Per-invocation 1x4x1x4 view of the output tile at offsets
// (0, thread_z*4, thread_y, thread_x*4).
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Zero fill: slice the zero constant into four 1x1x1x4 pieces and write one
// piece to each of the four positions (second index 0..3) of %16.
%17 = vector.extract_strided_slice %cst_0 {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%18 = vector.extract_strided_slice %cst_0 {offsets = [0, 1, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%19 = vector.extract_strided_slice %cst_0 {offsets = [0, 2, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%20 = vector.extract_strided_slice %cst_0 {offsets = [0, 3, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
vector.transfer_write %17, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %18, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %19, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %20, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Thread ids are queried a second time here (%21..%23 repeat %11..%13); the
// dump keeps both sets. They drive the per-invocation views used by the
// accumulation below.
%21 = "gpu.thread_id"() {dimension = "x"} : () -> index
%22 = "gpu.thread_id"() {dimension = "y"} : () -> index
%23 = "gpu.thread_id"() {dimension = "z"} : () -> index
// %26: 1x9x3x3 view of the input tile %8 at (0, thread_z*8, thread_y*2, 0).
%24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%23]
%25 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%22]
%26 = memref.subview %8[0, %24, %25, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
// %28: 3x3x3x4 view of the filter tile %9 at last-dimension offset thread_x*4.
%27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%28 = memref.subview %9[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// %31: 1x4x1x4 output view built with the same offset expressions and sizes
// as %16 above.
%29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%23]
%30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%31 = memref.subview %10[0, %29, %22, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Load the four 1x4 vectors at second index 0..3 of %31; they become the
// initial values of the loop-carried accumulators.
%32 = vector.transfer_read %31[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%33 = vector.transfer_read %31[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%34 = vector.transfer_read %31[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %31[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
// 3x3 loop nest (%arg0 outer, %arg5 inner, both 0..3 step 1) carrying the four
// 1x4 accumulators through iter_args.
%36:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %32, %arg2 = %33, %arg3 = %34, %arg4 = %35) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%37:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// Views for this (%arg0, %arg5) step: a 1x7x1x3 window of the input view and
// the matching 1x1x3x4 slice of the filter view.
%38 = memref.subview %26[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%39 = memref.subview %28[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Three 1x4 filter vectors, one per third-dimension index 0, 1, 2 of %39.
// They are reused by all four accumulator chains below.
%40 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%41 = vector.transfer_read %39[%c0, %c0, %c1, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%42 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
// Chain 0: read the 1x3 input vector at second index 0 of %38, split it into
// three 1x1 pieces, and fold each piece with %40/%41/%42 into %arg6 via three
// chained contractions. Result: %49.
%43 = vector.transfer_read %38[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%44 = vector.extract_strided_slice %43 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %40, %arg6 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%46 = vector.extract_strided_slice %43 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %41, %45 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%48 = vector.extract_strided_slice %43 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %42, %47 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
// Chain 1: same pattern with the input vector at second index 2, accumulating
// into %arg7. Result: %56.
%50 = vector.transfer_read %38[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%51 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %40, %arg7 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%53 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %53, %41, %52 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%55 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %42, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
// Chain 2: input vector at second index 4, accumulating into %arg8.
// Result: %63.
%57 = vector.transfer_read %38[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%58 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %40, %arg8 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%60 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %41, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%62 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %42, %61 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
// Chain 3: input vector at second index 6, accumulating into %arg9.
// Result: %70.
%64 = vector.transfer_read %38[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%65 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %40, %arg9 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%67 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %67, %41, %66 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%69 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%70 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %69, %42, %68 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
// Hand the four updated accumulators to the next inner iteration.
scf.yield %49, %56, %63, %70 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Forward the inner loop's results as the outer loop's carried values.
scf.yield %37#0, %37#1, %37#2, %37#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Store the final accumulators back into %31. The stores are emitted in
// reverse order (second index 3 down to 0); each targets a different index.
vector.transfer_write %36#3, %31[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#2, %31[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#1, %31[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#0, %31[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::TileAndVectorizeInOneWorkgroupPass ***
hal.executable.target @vulkan_spirv, filter="vulkan*" {
// Entry-point declaration for @conv_dispatch_0 on the @legacy_io interface
// (ordinal 0). The signature lists two readonly dispatch tensors
// (1x225x225x3 and 3x3x3x32) and one writeonly dispatch tensor (1x112x112x32).
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
// The region receives three index arguments, none of which is used in the
// body; it returns three constants instead.
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c2 = constant 2 : index
%c28 = constant 28 : index
%c28_0 = constant 28 : index
// NOTE(review): presumably the workgroup count along x, y, z. 2 * 16 = 32 and
// 28 * 4 = 112 match the per-workgroup tile sizes and output extents used by
// the dispatch function; confirm against the HAL dialect docs.
hal.return %c2, %c28, %c28_0 : index, index, index
}
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst = constant 0.000000e+00 : f32
%cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%17 = vector.extract_strided_slice %cst_0 {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%18 = vector.extract_strided_slice %cst_0 {offsets = [0, 1, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%19 = vector.extract_strided_slice %cst_0 {offsets = [0, 2, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
%20 = vector.extract_strided_slice %cst_0 {offsets = [0, 3, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
vector.transfer_write %17, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %18, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %19, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %20, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%21 = "gpu.thread_id"() {dimension = "x"} : () -> index
%22 = "gpu.thread_id"() {dimension = "y"} : () -> index
%23 = "gpu.thread_id"() {dimension = "z"} : () -> index
%24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%23]
%25 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%22]
%26 = memref.subview %8[0, %24, %25, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%28 = memref.subview %9[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%23]
%30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
%31 = memref.subview %10[0, %29, %22, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%32 = vector.transfer_read %31[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%33 = vector.transfer_read %31[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%34 = vector.transfer_read %31[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%35 = vector.transfer_read %31[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%36:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %32, %arg2 = %33, %arg3 = %34, %arg4 = %35) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%37:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%38 = memref.subview %26[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%39 = memref.subview %28[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%40 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%41 = vector.transfer_read %39[%c0, %c0, %c1, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%42 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%43 = vector.transfer_read %38[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%44 = vector.extract_strided_slice %43 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %40, %arg6 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%46 = vector.extract_strided_slice %43 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %41, %45 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%48 = vector.extract_strided_slice %43 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %42, %47 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%50 = vector.transfer_read %38[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%51 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %40, %arg7 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%53 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %53, %41, %52 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%55 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %42, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%57 = vector.transfer_read %38[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%58 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %40, %arg8 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%60 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %41, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%62 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %42, %61 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%64 = vector.transfer_read %38[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%65 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %40, %arg9 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%67 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %67, %41, %66 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%69 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%70 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %69, %42, %68 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
scf.yield %49, %56, %63, %70 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %37#0, %37#1, %37#2, %37#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
vector.transfer_write %36#3, %31[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#2, %31[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#1, %31[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %36#0, %31[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// *** IR Dump After Canonicalizer ***
// Dispatch module as printed after the Canonicalizer pass (see the dump header above).
// The module attribute records the SPIR-V target environment this dispatch is compiled
// against: SPIR-V v1.3, ARM:IntegratedGPU, subgroup_size = 16, at most 512 invocations
// per workgroup and 32768 bytes of compute shared memory.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Entry point of the convolution dispatch. The ABI attribute fixes the workgroup
// local size at [4, 4, 1]. The function takes no arguments; all buffers are reached
// through the @legacy_io interface bindings.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
// %cst is the zero vector used to initialise the output tile below; %cst_0 is the
// scalar passed as the padding operand of every vector.transfer_read. The index
// constants are used as subview/transfer indices and as loop bounds.
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
// Buffer bindings. The memref shapes match the input (1x225x225x3), filter (3x3x3x32)
// and result (1x112x112x32) tensors of the original mhlo.convolution at the top of
// the file.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-level tile origins.
//   %3, %4, %5: output offsets (wg_z * 4, wg_y * 4, wg_x * 16).
//   %6, %7:     input offsets  (wg_z * 8, wg_y * 8).
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
// Per-workgroup views: %8 is a 1x9x9x3 window of the input, %9 a 3x3x3x16 slice of
// the filter (offset %5 along dim 3), %10 a 1x4x4x16 tile of the output.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Thread-level output tile: a 1x4x1x4 subview of %10 at (tid_z * 4, tid_y, tid_x * 4).
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Zero-fill the thread's output tile, one 4-element row per dim-1 index 0..3.
vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// The thread ids are queried a second time here (%17-%19 repeat %11-%13) for the
// convolution proper; this dump still contains both copies.
%17 = "gpu.thread_id"() {dimension = "x"} : () -> index
%18 = "gpu.thread_id"() {dimension = "y"} : () -> index
%19 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Thread-level input window: 1x9x3x3 subview of %8 at (tid_z * 8, tid_y * 2).
%20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%19]
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
%22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
// Thread-level filter slice: 3x3x3x4 subview of %9 at dim-3 offset tid_x * 4.
%23 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// %27 addresses the same 1x4x1x4 region of %10 as %16 (same offsets tid_z * 4,
// tid_y, tid_x * 4 and same sizes), recomputed from the second set of thread ids.
%25 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Load the four output rows (dim-1 index 0..3) as the initial accumulator values.
%28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%31 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
// Two nested loops, each running 0..3 with step 1. %arg0 and %arg5 index dims 0 and 1
// of the filter slice %24 and offset dims 1 and 2 of the input window %22. The four
// accumulators are carried through both loops as iter_args.
%32:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %28, %arg2 = %29, %arg3 = %30, %arg4 = %31) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// %34: 1x7x1x3 input sub-window at (%arg0, %arg5); %35: 1x1x3x4 filter slice at
// (%arg0, %arg5).
%34 = memref.subview %22[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%35 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Three filter rows (dim-2 index 0, 1, 2), each a vector<1x4xf32>. They are shared
// by all four accumulator chains below.
%36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
// Accumulator 0 (%arg6): read the 3 input values at dim-1 index 0 of %34, then chain
// three contractions. Each contraction takes one 1x1 input element and one 1x4 filter
// row and adds their product into the running 1x4 sum.
%39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%40 = vector.extract_strided_slice %39 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %36, %arg6 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%42 = vector.extract_strided_slice %39 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %37, %41 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%44 = vector.extract_strided_slice %39 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %38, %43 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
// Accumulator 1 (%arg7): same chain using the input values at dim-1 index 2. The
// step of 2 between the rows read (0, 2, 4, 6) is consistent with
// window_strides = dense<2> on the original mhlo.convolution.
%46 = vector.transfer_read %34[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%47 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %36, %arg7 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%49 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %37, %48 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%51 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %38, %50 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
// Accumulator 2 (%arg8): input values at dim-1 index 4.
%53 = vector.transfer_read %34[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%54 = vector.extract_strided_slice %53 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%55 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %54, %36, %arg8 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%56 = vector.extract_strided_slice %53 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %56, %37, %55 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%58 = vector.extract_strided_slice %53 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %38, %57 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
// Accumulator 3 (%arg9): input values at dim-1 index 6.
%60 = vector.transfer_read %34[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%61 = vector.extract_strided_slice %60 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %61, %36, %arg9 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%63 = vector.extract_strided_slice %60 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%64 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %63, %37, %62 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
%65 = vector.extract_strided_slice %60 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
%66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %38, %64 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
// Hand the four updated accumulators to the next inner-loop iteration.
scf.yield %45, %52, %59, %66 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Forward the inner loop's results unchanged to the next outer-loop iteration.
scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Store the four final accumulator rows back into the thread's output tile
// (written in the order row 3, 2, 1, 0).
vector.transfer_write %32#3, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#2, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#1, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#0, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
// Interface for the dispatch: three StorageBuffer bindings in set 0. @ro0 and @ro1
// are declared with Read access, @wo2 with Write|Discard access.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ConvertVectorToGPUPass ***
// conv_dispatch_0 as printed after ConvertVectorToGPUPass (see the dump header above).
// In this dump the inner-loop accumulation is written with vector.extract,
// vector.broadcast, vector.shape_cast, mulf and addf on vector<4xf32> values, where the
// preceding Canonicalizer dump used vector.extract_strided_slice + vector.contract.
// The ABI attribute fixes the workgroup local size at [4, 4, 1].
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
// %cst is the zero vector used to initialise the output tile below; %cst_0 is the
// scalar passed as the padding operand of every vector.transfer_read. The index
// constants are used as transfer indices and as loop bounds.
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
// Buffer bindings. The memref shapes match the input (1x225x225x3), filter (3x3x3x32)
// and result (1x112x112x32) tensors of the original mhlo.convolution at the top of
// the file.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-level tile origins.
//   %3, %4, %5: output offsets (wg_z * 4, wg_y * 4, wg_x * 16).
//   %6, %7:     input offsets  (wg_z * 8, wg_y * 8).
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
// Per-workgroup views: %8 is a 1x9x9x3 window of the input, %9 a 3x3x3x16 slice of
// the filter (offset %5 along dim 3), %10 a 1x4x4x16 tile of the output.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Thread-level output tile: a 1x4x1x4 subview of %10 at (tid_z * 4, tid_y, tid_x * 4).
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Zero-fill the thread's output tile, one 4-element row per dim-1 index 0..3.
vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// The thread ids are queried a second time here (%17-%19 repeat %11-%13) for the
// convolution proper; this dump still contains both copies.
%17 = "gpu.thread_id"() {dimension = "x"} : () -> index
%18 = "gpu.thread_id"() {dimension = "y"} : () -> index
%19 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Thread-level input window: 1x9x3x3 subview of %8 at (tid_z * 8, tid_y * 2).
%20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%19]
%21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
%22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
// Thread-level filter slice: 3x3x3x4 subview of %9 at dim-3 offset tid_x * 4.
%23 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// %27 addresses the same 1x4x1x4 region of %10 as %16 (same offsets tid_z * 4,
// tid_y, tid_x * 4 and same sizes), recomputed from the second set of thread ids.
%25 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
%27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Load the four output rows (dim-1 index 0..3) as the initial accumulator values.
%28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%31 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
// Two nested loops, each running 0..3 with step 1. %arg0 and %arg5 index dims 0 and 1
// of the filter slice %24 and offset dims 1 and 2 of the input window %22. The four
// accumulators are carried through both loops as iter_args.
%32:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %28, %arg2 = %29, %arg3 = %30, %arg4 = %31) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// %34: 1x7x1x3 input sub-window at (%arg0, %arg5); %35: 1x1x3x4 filter slice at
// (%arg0, %arg5).
%34 = memref.subview %22[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%35 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Three filter rows (dim-2 index 0, 1, 2), each a vector<1x4xf32>. They are shared
// by all four accumulator chains below.
%36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
// Accumulator 0 (%arg6): read the 3 input values at dim-1 index 0 of %34. For each
// value: extract the scalar, broadcast it to vector<4xf32>, multiply by the matching
// filter row (shape-cast from 1x4 to 4) and add the running sum.
%39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%40 = vector.extract %39[0, 0] : vector<1x3xf32>
%41 = vector.broadcast %40 : f32 to vector<4xf32>
%42 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%43 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%44 = mulf %41, %42 : vector<4xf32>
%45 = addf %44, %43 : vector<4xf32>
%46 = vector.extract %39[0, 1] : vector<1x3xf32>
%47 = vector.broadcast %46 : f32 to vector<4xf32>
%48 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%49 = mulf %47, %48 : vector<4xf32>
%50 = addf %49, %45 : vector<4xf32>
%51 = vector.extract %39[0, 2] : vector<1x3xf32>
%52 = vector.broadcast %51 : f32 to vector<4xf32>
%53 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%54 = mulf %52, %53 : vector<4xf32>
%55 = addf %54, %50 : vector<4xf32>
// Cast the sum back to vector<1x4xf32>, the type carried by the loop iter_args.
%56 = vector.shape_cast %55 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7): same multiply-add chain using the input values at dim-1
// index 2. The step of 2 between the rows read (0, 2, 4, 6) is consistent with
// window_strides = dense<2> on the original mhlo.convolution.
%57 = vector.transfer_read %34[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%58 = vector.extract %57[0, 0] : vector<1x3xf32>
%59 = vector.broadcast %58 : f32 to vector<4xf32>
%60 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%61 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%62 = mulf %59, %60 : vector<4xf32>
%63 = addf %62, %61 : vector<4xf32>
%64 = vector.extract %57[0, 1] : vector<1x3xf32>
%65 = vector.broadcast %64 : f32 to vector<4xf32>
%66 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%67 = mulf %65, %66 : vector<4xf32>
%68 = addf %67, %63 : vector<4xf32>
%69 = vector.extract %57[0, 2] : vector<1x3xf32>
%70 = vector.broadcast %69 : f32 to vector<4xf32>
%71 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%72 = mulf %70, %71 : vector<4xf32>
%73 = addf %72, %68 : vector<4xf32>
%74 = vector.shape_cast %73 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8): input values at dim-1 index 4.
%75 = vector.transfer_read %34[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%76 = vector.extract %75[0, 0] : vector<1x3xf32>
%77 = vector.broadcast %76 : f32 to vector<4xf32>
%78 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%79 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%80 = mulf %77, %78 : vector<4xf32>
%81 = addf %80, %79 : vector<4xf32>
%82 = vector.extract %75[0, 1] : vector<1x3xf32>
%83 = vector.broadcast %82 : f32 to vector<4xf32>
%84 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%85 = mulf %83, %84 : vector<4xf32>
%86 = addf %85, %81 : vector<4xf32>
%87 = vector.extract %75[0, 2] : vector<1x3xf32>
%88 = vector.broadcast %87 : f32 to vector<4xf32>
%89 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%90 = mulf %88, %89 : vector<4xf32>
%91 = addf %90, %86 : vector<4xf32>
%92 = vector.shape_cast %91 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9): input values at dim-1 index 6.
%93 = vector.transfer_read %34[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%94 = vector.extract %93[0, 0] : vector<1x3xf32>
%95 = vector.broadcast %94 : f32 to vector<4xf32>
%96 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%97 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%98 = mulf %95, %96 : vector<4xf32>
%99 = addf %98, %97 : vector<4xf32>
%100 = vector.extract %93[0, 1] : vector<1x3xf32>
%101 = vector.broadcast %100 : f32 to vector<4xf32>
%102 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%103 = mulf %101, %102 : vector<4xf32>
%104 = addf %103, %99 : vector<4xf32>
%105 = vector.extract %93[0, 2] : vector<1x3xf32>
%106 = vector.broadcast %105 : f32 to vector<4xf32>
%107 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%108 = mulf %106, %107 : vector<4xf32>
%109 = addf %108, %104 : vector<4xf32>
%110 = vector.shape_cast %109 : vector<4xf32> to vector<1x4xf32>
// Hand the four updated accumulators to the next inner-loop iteration.
scf.yield %56, %74, %92, %110 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Forward the inner loop's results unchanged to the next outer-loop iteration.
scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Store the four final accumulator rows back into the thread's output tile
// (written in the order row 3, 2, 1, 0).
vector.transfer_write %32#3, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#2, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#1, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#0, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
// *** IR Dump After ConvertAffineToStandard ***
// Module as dumped after ConvertAffineToStandard. The spv.target_env attribute
// describes the SPIR-V target: version 1.3, the listed capabilities/extensions,
// an ARM integrated GPU, and resource limits (subgroup_size = 16, up to 512
// invocations per workgroup, 32768 bytes of shared memory).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Entry point for the convolution dispatch; the ABI attribute fixes the
// workgroup (local) size to 4 x 4 x 1 threads.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
// %cst is the zero vector used to clear the output slice; %cst_0 is the scalar
// padding operand passed to every vector.transfer_read below.
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
// Buffer bindings: %0 and %1 are the two read-only operands (1x225x225x3 and
// 3x3x3x32), %2 is the 1x112x112x32 result buffer. These shapes match the
// input, filter and result of the mhlo.convolution at the top of this gist.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
// Workgroup-level tile offsets. In this dump each multiplier is a freshly
// defined constant placed right before its use (%c4_1, %c4_2, %c8, %c8_3, ...),
// so several constants with the same value coexist.
//   %3 = wg_z * 4, %4 = wg_y * 4, %5 = wg_x * 16  -> offsets used for %2 (and %5 for %1)
//   %6 = wg_z * 8, %7 = wg_y * 8                  -> offsets used for %0
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%c4_1 = constant 4 : index
%3 = muli %workgroup_id_z, %c4_1 : index
%c4_2 = constant 4 : index
%4 = muli %workgroup_id_y, %c4_2 : index
%c16 = constant 16 : index
%5 = muli %workgroup_id_x, %c16 : index
%c8 = constant 8 : index
%6 = muli %workgroup_id_z, %c8 : index
%c8_3 = constant 8 : index
%7 = muli %workgroup_id_y, %c8_3 : index
// Per-workgroup views: a 1x9x9x3 window of %0, a 3x3x3x16 slice of %1 and a
// 1x4x4x16 tile of %2. All subviews use unit strides; the affine_map in each
// result type keeps the strides of the underlying buffer.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Per-thread output slice %16: 1x4x1x4 elements of the workgroup tile at
// offsets [0, tid_z * 4, tid_y, tid_x * 4].
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%c4_4 = constant 4 : index
%14 = muli %13, %c4_4 : index
%c4_5 = constant 4 : index
%15 = muli %11, %c4_5 : index
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Zero-fill the thread's slice: one 4-wide store of %cst for each of the four
// positions along dim 1 (%c0 .. %c3).
vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// The thread ids are queried a second time (%17..%19) and used to form the
// per-thread operand views for the accumulation:
//   %22: 1x9x3x3 window of %8 at [0, tid_z * 8, tid_y * 2, 0]
//   %24: 3x3x3x4 slice of %9 at [0, 0, 0, tid_x * 4]
//   %27: 1x4x1x4 slice of %10 at [0, tid_z * 4, tid_y, tid_x * 4], i.e. the same
//        base and the same thread-id-derived offsets as %16 above.
%17 = "gpu.thread_id"() {dimension = "x"} : () -> index
%18 = "gpu.thread_id"() {dimension = "y"} : () -> index
%19 = "gpu.thread_id"() {dimension = "z"} : () -> index
%c8_6 = constant 8 : index
%20 = muli %19, %c8_6 : index
%c2_7 = constant 2 : index
%21 = muli %18, %c2_7 : index
%22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%c4_8 = constant 4 : index
%23 = muli %17, %c4_8 : index
%24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%c4_9 = constant 4 : index
%25 = muli %19, %c4_9 : index
%c4_10 = constant 4 : index
%26 = muli %17, %c4_10 : index
%27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Load the four 1x4 rows of the output slice (dim-1 indices 0..3); they seed
// the loop-carried accumulators below.
%28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%31 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
// Two nested loops, each running 0, 1, 2 (%c0 to %c3 step %c1). %arg0 and %arg5
// index dims 0 and 1 of the filter slice %24 and offset dims 1 and 2 of the
// input window %22. The four accumulators are threaded through both loops as
// iter_args.
%32:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %28, %arg2 = %29, %arg3 = %30, %arg4 = %31) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// %34: 1x7x1x3 input window starting at [0, %arg0, %arg5, 0].
// %35: the 1x1x3x4 filter block selected by (%arg0, %arg5).
%34 = memref.subview %22[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%35 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Three 1x4 filter rows, one per index 0, 1, 2 along dim 2 of %35. They are
// reused by all four accumulator updates in this iteration.
%36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
// Accumulator 0 (%arg6 -> %56): read the 1x3 input vector at dim-1 index 0 of
// %34, then for each of its three elements broadcast the scalar to 4 lanes,
// multiply by the matching filter row and add to the running sum.
%39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%40 = vector.extract %39[0, 0] : vector<1x3xf32>
%41 = vector.broadcast %40 : f32 to vector<4xf32>
%42 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%43 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%44 = mulf %41, %42 : vector<4xf32>
%45 = addf %44, %43 : vector<4xf32>
%46 = vector.extract %39[0, 1] : vector<1x3xf32>
%47 = vector.broadcast %46 : f32 to vector<4xf32>
%48 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%49 = mulf %47, %48 : vector<4xf32>
%50 = addf %49, %45 : vector<4xf32>
%51 = vector.extract %39[0, 2] : vector<1x3xf32>
%52 = vector.broadcast %51 : f32 to vector<4xf32>
%53 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%54 = mulf %52, %53 : vector<4xf32>
%55 = addf %54, %50 : vector<4xf32>
%56 = vector.shape_cast %55 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7 -> %74): same pattern with the input vector at dim-1
// index 2. The step of 2 between consecutive accumulators' input rows is
// consistent with window_strides = 2 on the original mhlo.convolution.
%57 = vector.transfer_read %34[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%58 = vector.extract %57[0, 0] : vector<1x3xf32>
%59 = vector.broadcast %58 : f32 to vector<4xf32>
%60 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%61 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%62 = mulf %59, %60 : vector<4xf32>
%63 = addf %62, %61 : vector<4xf32>
%64 = vector.extract %57[0, 1] : vector<1x3xf32>
%65 = vector.broadcast %64 : f32 to vector<4xf32>
%66 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%67 = mulf %65, %66 : vector<4xf32>
%68 = addf %67, %63 : vector<4xf32>
%69 = vector.extract %57[0, 2] : vector<1x3xf32>
%70 = vector.broadcast %69 : f32 to vector<4xf32>
%71 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%72 = mulf %70, %71 : vector<4xf32>
%73 = addf %72, %68 : vector<4xf32>
%74 = vector.shape_cast %73 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8 -> %92): input vector at dim-1 index 4.
%75 = vector.transfer_read %34[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%76 = vector.extract %75[0, 0] : vector<1x3xf32>
%77 = vector.broadcast %76 : f32 to vector<4xf32>
%78 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%79 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%80 = mulf %77, %78 : vector<4xf32>
%81 = addf %80, %79 : vector<4xf32>
%82 = vector.extract %75[0, 1] : vector<1x3xf32>
%83 = vector.broadcast %82 : f32 to vector<4xf32>
%84 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%85 = mulf %83, %84 : vector<4xf32>
%86 = addf %85, %81 : vector<4xf32>
%87 = vector.extract %75[0, 2] : vector<1x3xf32>
%88 = vector.broadcast %87 : f32 to vector<4xf32>
%89 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%90 = mulf %88, %89 : vector<4xf32>
%91 = addf %90, %86 : vector<4xf32>
%92 = vector.shape_cast %91 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9 -> %110): input vector at dim-1 index 6, the last row
// of the 7-row window %34.
%93 = vector.transfer_read %34[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%94 = vector.extract %93[0, 0] : vector<1x3xf32>
%95 = vector.broadcast %94 : f32 to vector<4xf32>
%96 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%97 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%98 = mulf %95, %96 : vector<4xf32>
%99 = addf %98, %97 : vector<4xf32>
%100 = vector.extract %93[0, 1] : vector<1x3xf32>
%101 = vector.broadcast %100 : f32 to vector<4xf32>
%102 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%103 = mulf %101, %102 : vector<4xf32>
%104 = addf %103, %99 : vector<4xf32>
%105 = vector.extract %93[0, 2] : vector<1x3xf32>
%106 = vector.broadcast %105 : f32 to vector<4xf32>
%107 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%108 = mulf %106, %107 : vector<4xf32>
%109 = addf %108, %104 : vector<4xf32>
%110 = vector.shape_cast %109 : vector<4xf32> to vector<1x4xf32>
// Hand the four updated accumulators to the next inner iteration; the outer
// loop forwards the inner loop's results unchanged.
scf.yield %56, %74, %92, %110 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Store the final accumulators back to the thread's output slice; result #k
// goes to dim-1 index k (emitted in the order 3, 2, 1, 0).
vector.transfer_write %32#3, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#2, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#1, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#0, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
// Binding table referenced by the hal.interface.binding.subspan ops above:
// three storage buffers in descriptor set 0 - two read-only (bindings 0 and 1)
// and one write/discard (binding 2).
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After Canonicalizer ***
// Module as dumped after the Canonicalizer pass. The spv.target_env attribute
// describes the SPIR-V target: version 1.3, the listed capabilities/extensions,
// an ARM integrated GPU, and resource limits (subgroup_size = 16, up to 512
// invocations per workgroup, 32768 bytes of shared memory).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Entry point for the convolution dispatch; the ABI attribute fixes the
// workgroup (local) size to 4 x 4 x 1 threads.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
// All constants are defined once here at the top of the function. %cst is the
// zero vector used to clear the output slice; %cst_0 is the scalar padding
// operand passed to every vector.transfer_read below.
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffer bindings: %0 and %1 are the two read-only operands (1x225x225x3 and
// 3x3x3x32), %2 is the 1x112x112x32 result buffer. These shapes match the
// input, filter and result of the mhlo.convolution at the top of this gist.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
// Workgroup-level tile offsets, all using the shared constants above:
//   %3 = wg_z * 4, %4 = wg_y * 4, %5 = wg_x * 16  -> offsets used for %2 (and %5 for %1)
//   %6 = wg_z * 8, %7 = wg_y * 8                  -> offsets used for %0
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
// Per-workgroup views: a 1x9x9x3 window of %0, a 3x3x3x16 slice of %1 and a
// 1x4x4x16 tile of %2. All subviews use unit strides; the affine_map in each
// result type keeps the strides of the underlying buffer.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Per-thread output slice %16: 1x4x1x4 elements of the workgroup tile at
// offsets [0, tid_z * 4, tid_y, tid_x * 4].
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
%14 = muli %13, %c4 : index
%15 = muli %11, %c4 : index
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Zero-fill the thread's slice: one 4-wide store of %cst for each of the four
// positions along dim 1 (%c0 .. %c3).
vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// The thread ids are queried a second time (%17..%19) and used to form the
// per-thread operand views for the accumulation:
//   %22: 1x9x3x3 window of %8 at [0, tid_z * 8, tid_y * 2, 0]
//   %24: 3x3x3x4 slice of %9 at [0, 0, 0, tid_x * 4]
//   %27: 1x4x1x4 slice of %10 at [0, tid_z * 4, tid_y, tid_x * 4], i.e. the same
//        base and the same thread-id-derived offsets as %16 above.
// %23 and %26 are both tid_x * 4 computed from the same operands.
%17 = "gpu.thread_id"() {dimension = "x"} : () -> index
%18 = "gpu.thread_id"() {dimension = "y"} : () -> index
%19 = "gpu.thread_id"() {dimension = "z"} : () -> index
%20 = muli %19, %c8 : index
%21 = muli %18, %c2 : index
%22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%23 = muli %17, %c4 : index
%24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%25 = muli %19, %c4 : index
%26 = muli %17, %c4 : index
%27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Load the four 1x4 rows of the output slice (dim-1 indices 0..3); they seed
// the loop-carried accumulators below.
%28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%29 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%30 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%31 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
// Two nested loops, each running 0, 1, 2 (%c0 to %c3 step %c1). %arg0 and %arg5
// index dims 0 and 1 of the filter slice %24 and offset dims 1 and 2 of the
// input window %22. The four accumulators are threaded through both loops as
// iter_args.
%32:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %28, %arg2 = %29, %arg3 = %30, %arg4 = %31) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// %34: 1x7x1x3 input window starting at [0, %arg0, %arg5, 0].
// %35: the 1x1x3x4 filter block selected by (%arg0, %arg5).
%34 = memref.subview %22[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%35 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Three 1x4 filter rows, one per index 0, 1, 2 along dim 2 of %35. They are
// reused by all four accumulator updates in this iteration.
%36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
// Accumulator 0 (%arg6 -> %56): read the 1x3 input vector at dim-1 index 0 of
// %34, then for each of its three elements broadcast the scalar to 4 lanes,
// multiply by the matching filter row and add to the running sum.
%39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%40 = vector.extract %39[0, 0] : vector<1x3xf32>
%41 = vector.broadcast %40 : f32 to vector<4xf32>
%42 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%43 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%44 = mulf %41, %42 : vector<4xf32>
%45 = addf %44, %43 : vector<4xf32>
%46 = vector.extract %39[0, 1] : vector<1x3xf32>
%47 = vector.broadcast %46 : f32 to vector<4xf32>
%48 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%49 = mulf %47, %48 : vector<4xf32>
%50 = addf %49, %45 : vector<4xf32>
%51 = vector.extract %39[0, 2] : vector<1x3xf32>
%52 = vector.broadcast %51 : f32 to vector<4xf32>
%53 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%54 = mulf %52, %53 : vector<4xf32>
%55 = addf %54, %50 : vector<4xf32>
%56 = vector.shape_cast %55 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7 -> %74): same pattern with the input vector at dim-1
// index 2. The step of 2 between consecutive accumulators' input rows is
// consistent with window_strides = 2 on the original mhlo.convolution.
%57 = vector.transfer_read %34[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%58 = vector.extract %57[0, 0] : vector<1x3xf32>
%59 = vector.broadcast %58 : f32 to vector<4xf32>
%60 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%61 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%62 = mulf %59, %60 : vector<4xf32>
%63 = addf %62, %61 : vector<4xf32>
%64 = vector.extract %57[0, 1] : vector<1x3xf32>
%65 = vector.broadcast %64 : f32 to vector<4xf32>
%66 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%67 = mulf %65, %66 : vector<4xf32>
%68 = addf %67, %63 : vector<4xf32>
%69 = vector.extract %57[0, 2] : vector<1x3xf32>
%70 = vector.broadcast %69 : f32 to vector<4xf32>
%71 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%72 = mulf %70, %71 : vector<4xf32>
%73 = addf %72, %68 : vector<4xf32>
%74 = vector.shape_cast %73 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8 -> %92): input vector at dim-1 index 4.
%75 = vector.transfer_read %34[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%76 = vector.extract %75[0, 0] : vector<1x3xf32>
%77 = vector.broadcast %76 : f32 to vector<4xf32>
%78 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%79 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%80 = mulf %77, %78 : vector<4xf32>
%81 = addf %80, %79 : vector<4xf32>
%82 = vector.extract %75[0, 1] : vector<1x3xf32>
%83 = vector.broadcast %82 : f32 to vector<4xf32>
%84 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%85 = mulf %83, %84 : vector<4xf32>
%86 = addf %85, %81 : vector<4xf32>
%87 = vector.extract %75[0, 2] : vector<1x3xf32>
%88 = vector.broadcast %87 : f32 to vector<4xf32>
%89 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%90 = mulf %88, %89 : vector<4xf32>
%91 = addf %90, %86 : vector<4xf32>
%92 = vector.shape_cast %91 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9 -> %110): input vector at dim-1 index 6, the last row
// of the 7-row window %34.
%93 = vector.transfer_read %34[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%94 = vector.extract %93[0, 0] : vector<1x3xf32>
%95 = vector.broadcast %94 : f32 to vector<4xf32>
%96 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
%97 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%98 = mulf %95, %96 : vector<4xf32>
%99 = addf %98, %97 : vector<4xf32>
%100 = vector.extract %93[0, 1] : vector<1x3xf32>
%101 = vector.broadcast %100 : f32 to vector<4xf32>
%102 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
%103 = mulf %101, %102 : vector<4xf32>
%104 = addf %103, %99 : vector<4xf32>
%105 = vector.extract %93[0, 2] : vector<1x3xf32>
%106 = vector.broadcast %105 : f32 to vector<4xf32>
%107 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
%108 = mulf %106, %107 : vector<4xf32>
%109 = addf %108, %104 : vector<4xf32>
%110 = vector.shape_cast %109 : vector<4xf32> to vector<1x4xf32>
// Hand the four updated accumulators to the next inner iteration; the outer
// loop forwards the inner loop's results unchanged.
scf.yield %56, %74, %92, %110 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Store the final accumulators back to the thread's output slice; result #k
// goes to dim-1 index k (emitted in the order 3, 2, 1, 0).
vector.transfer_write %32#3, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#2, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#1, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %32#0, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
// Binding table referenced by the hal.interface.binding.subspan ops above:
// three storage buffers in descriptor set 0 - two read-only (bindings 0 and 1)
// and one write/discard (binding 2).
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After CSE ***
// Module carrying a SPIR-V target environment (spv.target_env). It holds the
// conv_dispatch_0 entry point and the @legacy_io HAL interface that names the
// three buffers the entry point binds.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Entry point with local_size = [4, 4, 1]. Each invocation accumulates a
// 1x4x1x4 slice of the 1x112x112x32 output from a 1x9x3x3 slice of the
// 1x225x225x3 input and a 3x3x3x4 slice of the 3x3x3x32 filter. The buffer
// shapes match the mhlo.convolution (window_strides = 2) at the top of this file.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffers: %0 and %1 are bound to the @ro0/@ro1 bindings, %2 to @wo2.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup tile offsets: output dims 1/2/3 start at (wg_z*4, wg_y*4, wg_x*16);
// input dims 1/2 start at (wg_z*8, wg_y*8), i.e. twice the output offset.
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
// Per-workgroup views: 1x9x9x3 of the input, 3x3x3x16 of the filter and
// 1x4x4x16 of the output.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Per-thread output view %16: 1x4x1x4 at offset (0, tid_z*4, tid_y, tid_x*4)
// inside the workgroup output tile.
%14 = muli %13, %c4 : index
%15 = muli %11, %c4 : index
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Zero-fill the thread's output view, one 4-element vector per dim-1 index 0..3.
vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Per-thread input view %19 (1x9x3x3 at (0, tid_z*8, tid_y*2, 0)) and
// per-thread filter view %20 (3x3x3x4 at dim-3 offset tid_x*4).
%17 = muli %13, %c8 : index
%18 = muli %12, %c2 : index
%19 = memref.subview %8[0, %17, %18, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%20 = memref.subview %9[0, 0, 0, %15] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Read the four output vectors written above back out of %16; they become the
// initial values of the loop-carried accumulators.
%21 = vector.transfer_read %16[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%22 = vector.transfer_read %16[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%23 = vector.transfer_read %16[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
%24 = vector.transfer_read %16[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
// Two nested loops over 0..2; %arg0 and %arg5 index filter dims 0 and 1.
// Four vector<1x4xf32> accumulators are carried through both loops.
%25:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %21, %arg2 = %22, %arg3 = %23, %arg4 = %24) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%26:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// Input window for this (%arg0, %arg5): 1x7x1x3 starting at (0, %arg0, %arg5, 0).
// Filter slice for this (%arg0, %arg5): 1x1x3x4.
%27 = memref.subview %19[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%28 = memref.subview %20[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Three filter vectors, one per index 0..2 of filter dim 2; all four
// accumulators below reuse them.
%29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%30 = vector.transfer_read %28[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
%31 = vector.transfer_read %28[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
// Accumulator 0 (%arg6): input window row 0. Each of the 3 input elements is
// broadcast to 4 lanes, multiplied by the matching filter vector, then added.
%32 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%33 = vector.extract %32[0, 0] : vector<1x3xf32>
%34 = vector.broadcast %33 : f32 to vector<4xf32>
%35 = vector.shape_cast %29 : vector<1x4xf32> to vector<4xf32>
%36 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%37 = mulf %34, %35 : vector<4xf32>
%38 = addf %37, %36 : vector<4xf32>
%39 = vector.extract %32[0, 1] : vector<1x3xf32>
%40 = vector.broadcast %39 : f32 to vector<4xf32>
%41 = vector.shape_cast %30 : vector<1x4xf32> to vector<4xf32>
%42 = mulf %40, %41 : vector<4xf32>
%43 = addf %42, %38 : vector<4xf32>
%44 = vector.extract %32[0, 2] : vector<1x3xf32>
%45 = vector.broadcast %44 : f32 to vector<4xf32>
%46 = vector.shape_cast %31 : vector<1x4xf32> to vector<4xf32>
%47 = mulf %45, %46 : vector<4xf32>
%48 = addf %47, %43 : vector<4xf32>
%49 = vector.shape_cast %48 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7): same computation on input window row 2.
%50 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%51 = vector.extract %50[0, 0] : vector<1x3xf32>
%52 = vector.broadcast %51 : f32 to vector<4xf32>
%53 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%54 = mulf %52, %35 : vector<4xf32>
%55 = addf %54, %53 : vector<4xf32>
%56 = vector.extract %50[0, 1] : vector<1x3xf32>
%57 = vector.broadcast %56 : f32 to vector<4xf32>
%58 = mulf %57, %41 : vector<4xf32>
%59 = addf %58, %55 : vector<4xf32>
%60 = vector.extract %50[0, 2] : vector<1x3xf32>
%61 = vector.broadcast %60 : f32 to vector<4xf32>
%62 = mulf %61, %46 : vector<4xf32>
%63 = addf %62, %59 : vector<4xf32>
%64 = vector.shape_cast %63 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8): same computation on input window row 4.
%65 = vector.transfer_read %27[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%66 = vector.extract %65[0, 0] : vector<1x3xf32>
%67 = vector.broadcast %66 : f32 to vector<4xf32>
%68 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%69 = mulf %67, %35 : vector<4xf32>
%70 = addf %69, %68 : vector<4xf32>
%71 = vector.extract %65[0, 1] : vector<1x3xf32>
%72 = vector.broadcast %71 : f32 to vector<4xf32>
%73 = mulf %72, %41 : vector<4xf32>
%74 = addf %73, %70 : vector<4xf32>
%75 = vector.extract %65[0, 2] : vector<1x3xf32>
%76 = vector.broadcast %75 : f32 to vector<4xf32>
%77 = mulf %76, %46 : vector<4xf32>
%78 = addf %77, %74 : vector<4xf32>
%79 = vector.shape_cast %78 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9): same computation on input window row 6.
%80 = vector.transfer_read %27[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
%81 = vector.extract %80[0, 0] : vector<1x3xf32>
%82 = vector.broadcast %81 : f32 to vector<4xf32>
%83 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%84 = mulf %82, %35 : vector<4xf32>
%85 = addf %84, %83 : vector<4xf32>
%86 = vector.extract %80[0, 1] : vector<1x3xf32>
%87 = vector.broadcast %86 : f32 to vector<4xf32>
%88 = mulf %87, %41 : vector<4xf32>
%89 = addf %88, %85 : vector<4xf32>
%90 = vector.extract %80[0, 2] : vector<1x3xf32>
%91 = vector.broadcast %90 : f32 to vector<4xf32>
%92 = mulf %91, %46 : vector<4xf32>
%93 = addf %92, %89 : vector<4xf32>
%94 = vector.shape_cast %93 : vector<4xf32> to vector<1x4xf32>
scf.yield %49, %64, %79, %94 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %26#0, %26#1, %26#2, %26#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Store the four final accumulators to dim-1 indices 3, 2, 1, 0 of the
// thread's output view.
vector.transfer_write %25#3, %16[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %25#2, %16[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %25#1, %16[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
vector.transfer_write %25#0, %16[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
// Buffer interface: three StorageBuffer bindings in set 0. Bindings 0 and 1
// are declared with Read access, binding 2 with Write|Discard.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::VectorTransferOptimizationPass ***
// Entry point with local_size = [4, 4, 1]. Each invocation accumulates a
// 1x4x1x4 slice of the 1x112x112x32 output from a 1x9x3x3 slice of the
// 1x225x225x3 input and a 3x3x3x4 slice of the 3x3x3x32 filter. In this form
// the accumulators are seeded from the zero constant through shape_casts, and
// every transfer_read/transfer_write uses a 1-D vector type.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffers: %0 and %1 are bound to the @ro0/@ro1 bindings, %2 to @wo2.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup tile offsets: output dims 1/2/3 start at (wg_z*4, wg_y*4, wg_x*16);
// input dims 1/2 start at (wg_z*8, wg_y*8).
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
// Per-workgroup views: 1x9x9x3 of the input, 3x3x3x16 of the filter and
// 1x4x4x16 of the output.
%8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%11 = "gpu.thread_id"() {dimension = "x"} : () -> index
%12 = "gpu.thread_id"() {dimension = "y"} : () -> index
%13 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Per-thread output view %16: 1x4x1x4 at offset (0, tid_z*4, tid_y, tid_x*4)
// inside the workgroup output tile.
%14 = muli %13, %c4 : index
%15 = muli %11, %c4 : index
%16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
// Four identical shape_casts of the zero constant to vector<4xf32>, one per
// accumulator; they are cast again to vector<1x4xf32> just before the loop.
%17 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<4xf32>
%18 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<4xf32>
%19 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<4xf32>
%20 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<4xf32>
// Per-thread input view %23 (1x9x3x3 at (0, tid_z*8, tid_y*2, 0)) and
// per-thread filter view %24 (3x3x3x4 at dim-3 offset tid_x*4).
%21 = muli %13, %c8 : index
%22 = muli %12, %c2 : index
%23 = memref.subview %8[0, %21, %22, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%24 = memref.subview %9[0, 0, 0, %15] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
%25 = vector.shape_cast %17 : vector<4xf32> to vector<1x4xf32>
%26 = vector.shape_cast %18 : vector<4xf32> to vector<1x4xf32>
%27 = vector.shape_cast %19 : vector<4xf32> to vector<1x4xf32>
%28 = vector.shape_cast %20 : vector<4xf32> to vector<1x4xf32>
// Two nested loops over 0..2; %arg0 and %arg5 index filter dims 0 and 1.
// Four vector<1x4xf32> accumulators are carried through both loops.
%29:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %25, %arg2 = %26, %arg3 = %27, %arg4 = %28) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%34:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// Input window for this (%arg0, %arg5): 1x7x1x3 starting at (0, %arg0, %arg5, 0).
// Filter slice for this (%arg0, %arg5): 1x1x3x4.
%35 = memref.subview %23[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
%36 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
// Three filter vectors, one per index 0..2 of filter dim 2; all four
// accumulators below reuse them.
%37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {masked = [false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
%38 = vector.transfer_read %36[%c0, %c0, %c1, %c0], %cst_0 {masked = [false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
%39 = vector.transfer_read %36[%c0, %c0, %c2, %c0], %cst_0 {masked = [false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
// Accumulator 0 (%arg6): input window row 0. Each of the 3 input elements is
// broadcast to 4 lanes, multiplied by the matching filter vector, then added.
%40 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%41 = vector.extract %40[0] : vector<3xf32>
%42 = vector.broadcast %41 : f32 to vector<4xf32>
%43 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%44 = mulf %42, %37 : vector<4xf32>
%45 = addf %44, %43 : vector<4xf32>
%46 = vector.extract %40[1] : vector<3xf32>
%47 = vector.broadcast %46 : f32 to vector<4xf32>
%48 = mulf %47, %38 : vector<4xf32>
%49 = addf %48, %45 : vector<4xf32>
%50 = vector.extract %40[2] : vector<3xf32>
%51 = vector.broadcast %50 : f32 to vector<4xf32>
%52 = mulf %51, %39 : vector<4xf32>
%53 = addf %52, %49 : vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7): same computation on input window row 2.
%55 = vector.transfer_read %35[%c0, %c2, %c0, %c0], %cst_0 {masked = [false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%56 = vector.extract %55[0] : vector<3xf32>
%57 = vector.broadcast %56 : f32 to vector<4xf32>
%58 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%59 = mulf %57, %37 : vector<4xf32>
%60 = addf %59, %58 : vector<4xf32>
%61 = vector.extract %55[1] : vector<3xf32>
%62 = vector.broadcast %61 : f32 to vector<4xf32>
%63 = mulf %62, %38 : vector<4xf32>
%64 = addf %63, %60 : vector<4xf32>
%65 = vector.extract %55[2] : vector<3xf32>
%66 = vector.broadcast %65 : f32 to vector<4xf32>
%67 = mulf %66, %39 : vector<4xf32>
%68 = addf %67, %64 : vector<4xf32>
%69 = vector.shape_cast %68 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8): same computation on input window row 4.
%70 = vector.transfer_read %35[%c0, %c4, %c0, %c0], %cst_0 {masked = [false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%71 = vector.extract %70[0] : vector<3xf32>
%72 = vector.broadcast %71 : f32 to vector<4xf32>
%73 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%74 = mulf %72, %37 : vector<4xf32>
%75 = addf %74, %73 : vector<4xf32>
%76 = vector.extract %70[1] : vector<3xf32>
%77 = vector.broadcast %76 : f32 to vector<4xf32>
%78 = mulf %77, %38 : vector<4xf32>
%79 = addf %78, %75 : vector<4xf32>
%80 = vector.extract %70[2] : vector<3xf32>
%81 = vector.broadcast %80 : f32 to vector<4xf32>
%82 = mulf %81, %39 : vector<4xf32>
%83 = addf %82, %79 : vector<4xf32>
%84 = vector.shape_cast %83 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9): same computation on input window row 6.
%85 = vector.transfer_read %35[%c0, %c6, %c0, %c0], %cst_0 {masked = [false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
%86 = vector.extract %85[0] : vector<3xf32>
%87 = vector.broadcast %86 : f32 to vector<4xf32>
%88 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%89 = mulf %87, %37 : vector<4xf32>
%90 = addf %89, %88 : vector<4xf32>
%91 = vector.extract %85[1] : vector<3xf32>
%92 = vector.broadcast %91 : f32 to vector<4xf32>
%93 = mulf %92, %38 : vector<4xf32>
%94 = addf %93, %90 : vector<4xf32>
%95 = vector.extract %85[2] : vector<3xf32>
%96 = vector.broadcast %95 : f32 to vector<4xf32>
%97 = mulf %96, %39 : vector<4xf32>
%98 = addf %97, %94 : vector<4xf32>
%99 = vector.shape_cast %98 : vector<4xf32> to vector<1x4xf32>
scf.yield %54, %69, %84, %99 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %34#0, %34#1, %34#2, %34#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Cast each final accumulator to vector<4xf32> and store it to dim-1 index
// 3, 2, 1, 0 of the thread's output view.
%30 = vector.shape_cast %29#3 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %30, %16[%c0, %c3, %c0, %c0] {masked = [false]} : vector<4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%31 = vector.shape_cast %29#2 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %31, %16[%c0, %c2, %c0, %c0] {masked = [false]} : vector<4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%32 = vector.shape_cast %29#1 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %32, %16[%c0, %c1, %c0, %c0] {masked = [false]} : vector<4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
%33 = vector.shape_cast %29#0 : vector<1x4xf32> to vector<4xf32>
vector.transfer_write %33, %16[%c0, %c0, %c0, %c0] {masked = [false]} : vector<4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
return
}
// *** IR Dump After LegalizeStandardForSPIRV ***
// Module carrying a SPIR-V target environment (spv.target_env). It holds the
// conv_dispatch_0 entry point and the @legacy_io HAL interface that names the
// three buffers the entry point binds.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Entry point with local_size = [4, 4, 1]. This form contains no
// memref.subview ops: every transfer_read/transfer_write indexes the bound
// buffers %0/%1/%2 directly, with indices built from explicit addi chains over
// the workgroup offsets, thread offsets and loop variables.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffers: %0 and %1 are bound to the @ro0/@ro1 bindings, %2 to @wo2.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup offsets: %3/%4/%5 = (wg_z*4, wg_y*4, wg_x*16) are used in the
// output indices; %6/%7 = (wg_z*8, wg_y*8) are used in the input indices.
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
%8 = "gpu.thread_id"() {dimension = "x"} : () -> index
%9 = "gpu.thread_id"() {dimension = "y"} : () -> index
%10 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Thread offsets: %11 = tid_z*4, %12 = tid_x*4, %13 = tid_z*8, %14 = tid_y*2.
%11 = muli %10, %c4 : index
%12 = muli %8, %c4 : index
%13 = muli %10, %c8 : index
%14 = muli %9, %c2 : index
// Four identical shape_casts of the zero constant, one per accumulator.
%15 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<1x4xf32>
%16 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<1x4xf32>
%17 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<1x4xf32>
%18 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<1x4xf32>
// Two nested loops over 0..2; %arg0 and %arg5 index filter dims 0 and 1.
// Four vector<1x4xf32> accumulators are carried through both loops.
%19:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %15, %arg2 = %16, %arg3 = %17, %arg4 = %18) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%39:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// Filter vectors at [%arg0, %arg5, k, wg_x*16 + tid_x*4] for k = 0, 1, 2.
// The dim-3 index %5 + %12 is recomputed before each read.
%40 = addi %5, %12 : index
%41 = vector.transfer_read %1[%arg0, %arg5, %c0, %40], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
%42 = addi %5, %12 : index
%43 = vector.transfer_read %1[%arg0, %arg5, %c1, %42], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
%44 = addi %5, %12 : index
%45 = vector.transfer_read %1[%arg0, %arg5, %c2, %44], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
// Accumulator 0 (%arg6): input read at
// [0, wg_z*8 + tid_z*8 + %arg0, wg_y*8 + tid_y*2 + %arg5, 0]. Each of the 3
// elements is broadcast to 4 lanes, multiplied by a filter vector, then added.
%46 = addi %13, %arg0 : index
%47 = addi %14, %arg5 : index
%48 = addi %6, %46 : index
%49 = addi %7, %47 : index
%50 = vector.transfer_read %0[%c0, %48, %49, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%51 = vector.extract %50[0] : vector<3xf32>
%52 = vector.broadcast %51 : f32 to vector<4xf32>
%53 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%54 = mulf %52, %41 : vector<4xf32>
%55 = addf %54, %53 : vector<4xf32>
%56 = vector.extract %50[1] : vector<3xf32>
%57 = vector.broadcast %56 : f32 to vector<4xf32>
%58 = mulf %57, %43 : vector<4xf32>
%59 = addf %58, %55 : vector<4xf32>
%60 = vector.extract %50[2] : vector<3xf32>
%61 = vector.broadcast %60 : f32 to vector<4xf32>
%62 = mulf %61, %45 : vector<4xf32>
%63 = addf %62, %59 : vector<4xf32>
%64 = vector.shape_cast %63 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7): same computation with the input dim-1 index moved by +2.
%65 = addi %arg0, %c2 : index
%66 = addi %13, %65 : index
%67 = addi %14, %arg5 : index
%68 = addi %6, %66 : index
%69 = addi %7, %67 : index
%70 = vector.transfer_read %0[%c0, %68, %69, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%71 = vector.extract %70[0] : vector<3xf32>
%72 = vector.broadcast %71 : f32 to vector<4xf32>
%73 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%74 = mulf %72, %41 : vector<4xf32>
%75 = addf %74, %73 : vector<4xf32>
%76 = vector.extract %70[1] : vector<3xf32>
%77 = vector.broadcast %76 : f32 to vector<4xf32>
%78 = mulf %77, %43 : vector<4xf32>
%79 = addf %78, %75 : vector<4xf32>
%80 = vector.extract %70[2] : vector<3xf32>
%81 = vector.broadcast %80 : f32 to vector<4xf32>
%82 = mulf %81, %45 : vector<4xf32>
%83 = addf %82, %79 : vector<4xf32>
%84 = vector.shape_cast %83 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8): same computation with the input dim-1 index moved by +4.
%85 = addi %arg0, %c4 : index
%86 = addi %13, %85 : index
%87 = addi %14, %arg5 : index
%88 = addi %6, %86 : index
%89 = addi %7, %87 : index
%90 = vector.transfer_read %0[%c0, %88, %89, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%91 = vector.extract %90[0] : vector<3xf32>
%92 = vector.broadcast %91 : f32 to vector<4xf32>
%93 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%94 = mulf %92, %41 : vector<4xf32>
%95 = addf %94, %93 : vector<4xf32>
%96 = vector.extract %90[1] : vector<3xf32>
%97 = vector.broadcast %96 : f32 to vector<4xf32>
%98 = mulf %97, %43 : vector<4xf32>
%99 = addf %98, %95 : vector<4xf32>
%100 = vector.extract %90[2] : vector<3xf32>
%101 = vector.broadcast %100 : f32 to vector<4xf32>
%102 = mulf %101, %45 : vector<4xf32>
%103 = addf %102, %99 : vector<4xf32>
%104 = vector.shape_cast %103 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9): same computation with the input dim-1 index moved by +6.
%105 = addi %arg0, %c6 : index
%106 = addi %13, %105 : index
%107 = addi %14, %arg5 : index
%108 = addi %6, %106 : index
%109 = addi %7, %107 : index
%110 = vector.transfer_read %0[%c0, %108, %109, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%111 = vector.extract %110[0] : vector<3xf32>
%112 = vector.broadcast %111 : f32 to vector<4xf32>
%113 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%114 = mulf %112, %41 : vector<4xf32>
%115 = addf %114, %113 : vector<4xf32>
%116 = vector.extract %110[1] : vector<3xf32>
%117 = vector.broadcast %116 : f32 to vector<4xf32>
%118 = mulf %117, %43 : vector<4xf32>
%119 = addf %118, %115 : vector<4xf32>
%120 = vector.extract %110[2] : vector<3xf32>
%121 = vector.broadcast %120 : f32 to vector<4xf32>
%122 = mulf %121, %45 : vector<4xf32>
%123 = addf %122, %119 : vector<4xf32>
%124 = vector.shape_cast %123 : vector<4xf32> to vector<1x4xf32>
scf.yield %64, %84, %104, %124 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %39#0, %39#1, %39#2, %39#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Store the accumulators to the output buffer at
// [0, wg_z*4 + tid_z*4 + r, wg_y*4 + tid_y, wg_x*16 + tid_x*4], in the order
// r = 3, 2, 1, 0. Each store recomputes its own copy of the dim-2/dim-3 indices.
%20 = vector.shape_cast %19#3 : vector<1x4xf32> to vector<4xf32>
%21 = addi %11, %c3 : index
%22 = addi %3, %21 : index
%23 = addi %4, %9 : index
%24 = addi %5, %12 : index
vector.transfer_write %20, %2[%c0, %22, %23, %24] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%25 = vector.shape_cast %19#2 : vector<1x4xf32> to vector<4xf32>
%26 = addi %11, %c2 : index
%27 = addi %3, %26 : index
%28 = addi %4, %9 : index
%29 = addi %5, %12 : index
vector.transfer_write %25, %2[%c0, %27, %28, %29] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%30 = vector.shape_cast %19#1 : vector<1x4xf32> to vector<4xf32>
%31 = addi %11, %c1 : index
%32 = addi %3, %31 : index
%33 = addi %4, %9 : index
%34 = addi %5, %12 : index
vector.transfer_write %30, %2[%c0, %32, %33, %34] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%35 = vector.shape_cast %19#0 : vector<1x4xf32> to vector<4xf32>
%36 = addi %3, %11 : index
%37 = addi %4, %9 : index
%38 = addi %5, %12 : index
vector.transfer_write %35, %2[%c0, %36, %37, %38] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
return
}
// Buffer interface: three StorageBuffer bindings in set 0. Bindings 0 and 1
// are declared with Read access, binding 2 with Write|Discard.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After Canonicalizer ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Entry point with local_size [4, 4, 1]. Each invocation runs a 3x3 scf.for nest that
// carries four vector<1x4xf32> accumulators, then writes the four results into %2.
// NOTE(review): presumably the lowered form of the stride-2 mhlo.convolution in @conv -- confirm against earlier dumps.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffer views: %0 and %1 come from the two Read bindings, %2 from the Write|Discard binding.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-id-scaled base offsets: %3 and %4 feed indices 1 and 2 of %2, %5 feeds the last
// index of both %1 and %2, and %6 and %7 feed indices 1 and 2 of %0.
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
// Thread-id-scaled offsets that are later added to the workgroup bases above.
%8 = "gpu.thread_id"() {dimension = "x"} : () -> index
%9 = "gpu.thread_id"() {dimension = "y"} : () -> index
%10 = "gpu.thread_id"() {dimension = "z"} : () -> index
%11 = muli %10, %c4 : index
%12 = muli %8, %c4 : index
%13 = muli %10, %c8 : index
%14 = muli %9, %c2 : index
// Outer loop (%arg0) and inner loop (%arg5) both run 0..3 step 1; all four accumulators start at %cst (zeros).
%15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%35:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// Three vector<4xf32> reads from %1 at third index 0, 1 and 2.
// %36, %38 and %40 are the same addi (%5 + %12) recomputed three times in this dump.
%36 = addi %5, %12 : index
%37 = vector.transfer_read %1[%arg0, %arg5, %c0, %36], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
%38 = addi %5, %12 : index
%39 = vector.transfer_read %1[%arg0, %arg5, %c1, %38], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
%40 = addi %5, %12 : index
%41 = vector.transfer_read %1[%arg0, %arg5, %c2, %40], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
// Accumulator 0 (%arg6): read a vector<3xf32> from %0 at row %6 + %13 + %arg0, then broadcast
// each of its three elements and multiply-add with %37, %39 and %41 in turn.
%42 = addi %13, %arg0 : index
%43 = addi %14, %arg5 : index
%44 = addi %6, %42 : index
%45 = addi %7, %43 : index
%46 = vector.transfer_read %0[%c0, %44, %45, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%47 = vector.extract %46[0] : vector<3xf32>
%48 = vector.broadcast %47 : f32 to vector<4xf32>
%49 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%50 = mulf %48, %37 : vector<4xf32>
%51 = addf %50, %49 : vector<4xf32>
%52 = vector.extract %46[1] : vector<3xf32>
%53 = vector.broadcast %52 : f32 to vector<4xf32>
%54 = mulf %53, %39 : vector<4xf32>
%55 = addf %54, %51 : vector<4xf32>
%56 = vector.extract %46[2] : vector<3xf32>
%57 = vector.broadcast %56 : f32 to vector<4xf32>
%58 = mulf %57, %41 : vector<4xf32>
%59 = addf %58, %55 : vector<4xf32>
%60 = vector.shape_cast %59 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7): same pattern with the row offset increased by 2.
%61 = addi %arg0, %c2 : index
%62 = addi %13, %61 : index
%63 = addi %14, %arg5 : index
%64 = addi %6, %62 : index
%65 = addi %7, %63 : index
%66 = vector.transfer_read %0[%c0, %64, %65, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%67 = vector.extract %66[0] : vector<3xf32>
%68 = vector.broadcast %67 : f32 to vector<4xf32>
%69 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%70 = mulf %68, %37 : vector<4xf32>
%71 = addf %70, %69 : vector<4xf32>
%72 = vector.extract %66[1] : vector<3xf32>
%73 = vector.broadcast %72 : f32 to vector<4xf32>
%74 = mulf %73, %39 : vector<4xf32>
%75 = addf %74, %71 : vector<4xf32>
%76 = vector.extract %66[2] : vector<3xf32>
%77 = vector.broadcast %76 : f32 to vector<4xf32>
%78 = mulf %77, %41 : vector<4xf32>
%79 = addf %78, %75 : vector<4xf32>
%80 = vector.shape_cast %79 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8): same pattern with the row offset increased by 4.
%81 = addi %arg0, %c4 : index
%82 = addi %13, %81 : index
%83 = addi %14, %arg5 : index
%84 = addi %6, %82 : index
%85 = addi %7, %83 : index
%86 = vector.transfer_read %0[%c0, %84, %85, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%87 = vector.extract %86[0] : vector<3xf32>
%88 = vector.broadcast %87 : f32 to vector<4xf32>
%89 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%90 = mulf %88, %37 : vector<4xf32>
%91 = addf %90, %89 : vector<4xf32>
%92 = vector.extract %86[1] : vector<3xf32>
%93 = vector.broadcast %92 : f32 to vector<4xf32>
%94 = mulf %93, %39 : vector<4xf32>
%95 = addf %94, %91 : vector<4xf32>
%96 = vector.extract %86[2] : vector<3xf32>
%97 = vector.broadcast %96 : f32 to vector<4xf32>
%98 = mulf %97, %41 : vector<4xf32>
%99 = addf %98, %95 : vector<4xf32>
%100 = vector.shape_cast %99 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9): same pattern with the row offset increased by 6.
%101 = addi %arg0, %c6 : index
%102 = addi %13, %101 : index
%103 = addi %14, %arg5 : index
%104 = addi %6, %102 : index
%105 = addi %7, %103 : index
%106 = vector.transfer_read %0[%c0, %104, %105, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%107 = vector.extract %106[0] : vector<3xf32>
%108 = vector.broadcast %107 : f32 to vector<4xf32>
%109 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%110 = mulf %108, %37 : vector<4xf32>
%111 = addf %110, %109 : vector<4xf32>
%112 = vector.extract %106[1] : vector<3xf32>
%113 = vector.broadcast %112 : f32 to vector<4xf32>
%114 = mulf %113, %39 : vector<4xf32>
%115 = addf %114, %111 : vector<4xf32>
%116 = vector.extract %106[2] : vector<3xf32>
%117 = vector.broadcast %116 : f32 to vector<4xf32>
%118 = mulf %117, %41 : vector<4xf32>
%119 = addf %118, %115 : vector<4xf32>
%120 = vector.shape_cast %119 : vector<4xf32> to vector<1x4xf32>
scf.yield %60, %80, %100, %120 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %35#0, %35#1, %35#2, %35#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Write-back: results #3, #2, #1, #0 are stored to %2 at row offsets +3, +2, +1, +0 from %3 + %11.
// The column (%4 + %9) and last-dimension (%5 + %12) indices are recomputed before every write in this dump.
%16 = vector.shape_cast %15#3 : vector<1x4xf32> to vector<4xf32>
%17 = addi %11, %c3 : index
%18 = addi %3, %17 : index
%19 = addi %4, %9 : index
%20 = addi %5, %12 : index
vector.transfer_write %16, %2[%c0, %18, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%21 = vector.shape_cast %15#2 : vector<1x4xf32> to vector<4xf32>
%22 = addi %11, %c2 : index
%23 = addi %3, %22 : index
%24 = addi %4, %9 : index
%25 = addi %5, %12 : index
vector.transfer_write %21, %2[%c0, %23, %24, %25] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%26 = vector.shape_cast %15#1 : vector<1x4xf32> to vector<4xf32>
%27 = addi %11, %c1 : index
%28 = addi %3, %27 : index
%29 = addi %4, %9 : index
%30 = addi %5, %12 : index
vector.transfer_write %26, %2[%c0, %28, %29, %30] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%31 = vector.shape_cast %15#0 : vector<1x4xf32> to vector<4xf32>
%32 = addi %3, %11 : index
%33 = addi %4, %9 : index
%34 = addi %5, %12 : index
vector.transfer_write %31, %2[%c0, %32, %33, %34] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
return
}
// Private binding table: three StorageBuffer bindings in set 0 (slots 0, 1, 2).
// @ro0 and @ro1 are declared with Read access, @wo2 with Write|Discard access.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After CSE ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Entry point with local_size [4, 4, 1]. Each invocation runs a 3x3 scf.for nest that
// carries four vector<1x4xf32> accumulators, then writes the four results into %2.
// In this dump each index expression appears once and is shared by every access that uses it.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffer views: %0 and %1 come from the two Read bindings, %2 from the Write|Discard binding.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-id-scaled base offsets: %3 and %4 feed indices 1 and 2 of %2, %5 feeds the last
// index of both %1 and %2, and %6 and %7 feed indices 1 and 2 of %0.
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
// Thread-id-scaled offsets that are later added to the workgroup bases above.
%8 = "gpu.thread_id"() {dimension = "x"} : () -> index
%9 = "gpu.thread_id"() {dimension = "y"} : () -> index
%10 = "gpu.thread_id"() {dimension = "z"} : () -> index
%11 = muli %10, %c4 : index
%12 = muli %8, %c4 : index
%13 = muli %10, %c8 : index
%14 = muli %9, %c2 : index
// Outer loop (%arg0) and inner loop (%arg5) both run 0..3 step 1; all four accumulators start at %cst (zeros).
%15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%29:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// Three vector<4xf32> reads from %1 at third index 0, 1 and 2, all using the single last index %30.
%30 = addi %5, %12 : index
%31 = vector.transfer_read %1[%arg0, %arg5, %c0, %30], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
%32 = vector.transfer_read %1[%arg0, %arg5, %c1, %30], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
%33 = vector.transfer_read %1[%arg0, %arg5, %c2, %30], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
// Accumulator 0 (%arg6): read a vector<3xf32> from %0 at row %6 + %13 + %arg0, then broadcast
// each of its three elements and multiply-add with %31, %32 and %33 in turn.
// The column index %37 computed here is reused by the other three reads from %0 below.
%34 = addi %13, %arg0 : index
%35 = addi %14, %arg5 : index
%36 = addi %6, %34 : index
%37 = addi %7, %35 : index
%38 = vector.transfer_read %0[%c0, %36, %37, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%39 = vector.extract %38[0] : vector<3xf32>
%40 = vector.broadcast %39 : f32 to vector<4xf32>
%41 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%42 = mulf %40, %31 : vector<4xf32>
%43 = addf %42, %41 : vector<4xf32>
%44 = vector.extract %38[1] : vector<3xf32>
%45 = vector.broadcast %44 : f32 to vector<4xf32>
%46 = mulf %45, %32 : vector<4xf32>
%47 = addf %46, %43 : vector<4xf32>
%48 = vector.extract %38[2] : vector<3xf32>
%49 = vector.broadcast %48 : f32 to vector<4xf32>
%50 = mulf %49, %33 : vector<4xf32>
%51 = addf %50, %47 : vector<4xf32>
%52 = vector.shape_cast %51 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7): same pattern with the row offset increased by 2.
%53 = addi %arg0, %c2 : index
%54 = addi %13, %53 : index
%55 = addi %6, %54 : index
%56 = vector.transfer_read %0[%c0, %55, %37, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%57 = vector.extract %56[0] : vector<3xf32>
%58 = vector.broadcast %57 : f32 to vector<4xf32>
%59 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%60 = mulf %58, %31 : vector<4xf32>
%61 = addf %60, %59 : vector<4xf32>
%62 = vector.extract %56[1] : vector<3xf32>
%63 = vector.broadcast %62 : f32 to vector<4xf32>
%64 = mulf %63, %32 : vector<4xf32>
%65 = addf %64, %61 : vector<4xf32>
%66 = vector.extract %56[2] : vector<3xf32>
%67 = vector.broadcast %66 : f32 to vector<4xf32>
%68 = mulf %67, %33 : vector<4xf32>
%69 = addf %68, %65 : vector<4xf32>
%70 = vector.shape_cast %69 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8): same pattern with the row offset increased by 4.
%71 = addi %arg0, %c4 : index
%72 = addi %13, %71 : index
%73 = addi %6, %72 : index
%74 = vector.transfer_read %0[%c0, %73, %37, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%75 = vector.extract %74[0] : vector<3xf32>
%76 = vector.broadcast %75 : f32 to vector<4xf32>
%77 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%78 = mulf %76, %31 : vector<4xf32>
%79 = addf %78, %77 : vector<4xf32>
%80 = vector.extract %74[1] : vector<3xf32>
%81 = vector.broadcast %80 : f32 to vector<4xf32>
%82 = mulf %81, %32 : vector<4xf32>
%83 = addf %82, %79 : vector<4xf32>
%84 = vector.extract %74[2] : vector<3xf32>
%85 = vector.broadcast %84 : f32 to vector<4xf32>
%86 = mulf %85, %33 : vector<4xf32>
%87 = addf %86, %83 : vector<4xf32>
%88 = vector.shape_cast %87 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9): same pattern with the row offset increased by 6.
%89 = addi %arg0, %c6 : index
%90 = addi %13, %89 : index
%91 = addi %6, %90 : index
%92 = vector.transfer_read %0[%c0, %91, %37, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%93 = vector.extract %92[0] : vector<3xf32>
%94 = vector.broadcast %93 : f32 to vector<4xf32>
%95 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%96 = mulf %94, %31 : vector<4xf32>
%97 = addf %96, %95 : vector<4xf32>
%98 = vector.extract %92[1] : vector<3xf32>
%99 = vector.broadcast %98 : f32 to vector<4xf32>
%100 = mulf %99, %32 : vector<4xf32>
%101 = addf %100, %97 : vector<4xf32>
%102 = vector.extract %92[2] : vector<3xf32>
%103 = vector.broadcast %102 : f32 to vector<4xf32>
%104 = mulf %103, %33 : vector<4xf32>
%105 = addf %104, %101 : vector<4xf32>
%106 = vector.shape_cast %105 : vector<4xf32> to vector<1x4xf32>
scf.yield %52, %70, %88, %106 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %29#0, %29#1, %29#2, %29#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Write-back: results #3, #2, #1, #0 are stored to %2 at row offsets +3, +2, +1, +0 from %3 + %11.
// All four writes share the column index %19 and the last-dimension index %20.
%16 = vector.shape_cast %15#3 : vector<1x4xf32> to vector<4xf32>
%17 = addi %11, %c3 : index
%18 = addi %3, %17 : index
%19 = addi %4, %9 : index
%20 = addi %5, %12 : index
vector.transfer_write %16, %2[%c0, %18, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%21 = vector.shape_cast %15#2 : vector<1x4xf32> to vector<4xf32>
%22 = addi %11, %c2 : index
%23 = addi %3, %22 : index
vector.transfer_write %21, %2[%c0, %23, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%24 = vector.shape_cast %15#1 : vector<1x4xf32> to vector<4xf32>
%25 = addi %11, %c1 : index
%26 = addi %3, %25 : index
vector.transfer_write %24, %2[%c0, %26, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
%27 = vector.shape_cast %15#0 : vector<1x4xf32> to vector<4xf32>
%28 = addi %3, %11 : index
vector.transfer_write %27, %2[%c0, %28, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
return
}
// Private binding table: three StorageBuffer bindings in set 0 (slots 0, 1, 2).
// @ro0 and @ro1 are declared with Read access, @wo2 with Write|Discard access.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::VectorizeMemRefPass ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Entry point with local_size [4, 4, 1]. Each invocation runs a 3x3 scf.for nest that
// carries four vector<1x4xf32> accumulators, then stores the four results into %2.
// In this dump %1 and %2 are accessed with memref.load / memref.store of whole vector<4xf32>
// elements, while %0 is still read with vector.transfer_read.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffer views: %1 and %2 are typed with vector<4xf32> elements and a last dimension of 8;
// %0 keeps f32 elements.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-id-scaled base offsets: %3 and %4 feed indices 1 and 2 of %2, %5 feeds the last
// index of both %1 and %2 (after division by 4), and %6 and %7 feed indices 1 and 2 of %0.
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
// Thread-id-scaled offsets that are later added to the workgroup bases above.
%8 = "gpu.thread_id"() {dimension = "x"} : () -> index
%9 = "gpu.thread_id"() {dimension = "y"} : () -> index
%10 = "gpu.thread_id"() {dimension = "z"} : () -> index
%11 = muli %10, %c4 : index
%12 = muli %8, %c4 : index
%13 = muli %10, %c8 : index
%14 = muli %9, %c2 : index
// Outer loop (%arg0) and inner loop (%arg5) both run 0..3 step 1; all four accumulators start at %cst (zeros).
%15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
%41:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
// Three loads from %1 at third index 0, 1 and 2. The last index is %42 divided by 4, with a new
// constant 4 and a new divi_signed emitted for each load. The bitcast and shape_cast that follow
// each load have identical source and result types.
%42 = addi %5, %12 : index
%c4_5 = constant 4 : index
%43 = divi_signed %42, %c4_5 : index
%44 = memref.load %1[%arg0, %arg5, %c0, %43] : memref<3x3x3x8xvector<4xf32>>
%45 = vector.bitcast %44 : vector<4xf32> to vector<4xf32>
%46 = vector.shape_cast %45 : vector<4xf32> to vector<4xf32>
%c4_6 = constant 4 : index
%47 = divi_signed %42, %c4_6 : index
%48 = memref.load %1[%arg0, %arg5, %c1, %47] : memref<3x3x3x8xvector<4xf32>>
%49 = vector.bitcast %48 : vector<4xf32> to vector<4xf32>
%50 = vector.shape_cast %49 : vector<4xf32> to vector<4xf32>
%c4_7 = constant 4 : index
%51 = divi_signed %42, %c4_7 : index
%52 = memref.load %1[%arg0, %arg5, %c2, %51] : memref<3x3x3x8xvector<4xf32>>
%53 = vector.bitcast %52 : vector<4xf32> to vector<4xf32>
%54 = vector.shape_cast %53 : vector<4xf32> to vector<4xf32>
// Accumulator 0 (%arg6): read a vector<3xf32> from %0 at row %6 + %13 + %arg0, then broadcast
// each of its three elements and multiply-add with %46, %50 and %54 in turn.
// The column index %58 computed here is reused by the other three reads from %0 below.
%55 = addi %13, %arg0 : index
%56 = addi %14, %arg5 : index
%57 = addi %6, %55 : index
%58 = addi %7, %56 : index
%59 = vector.transfer_read %0[%c0, %57, %58, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%60 = vector.extract %59[0] : vector<3xf32>
%61 = vector.broadcast %60 : f32 to vector<4xf32>
%62 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
%63 = mulf %61, %46 : vector<4xf32>
%64 = addf %63, %62 : vector<4xf32>
%65 = vector.extract %59[1] : vector<3xf32>
%66 = vector.broadcast %65 : f32 to vector<4xf32>
%67 = mulf %66, %50 : vector<4xf32>
%68 = addf %67, %64 : vector<4xf32>
%69 = vector.extract %59[2] : vector<3xf32>
%70 = vector.broadcast %69 : f32 to vector<4xf32>
%71 = mulf %70, %54 : vector<4xf32>
%72 = addf %71, %68 : vector<4xf32>
%73 = vector.shape_cast %72 : vector<4xf32> to vector<1x4xf32>
// Accumulator 1 (%arg7): same pattern with the row offset increased by 2.
%74 = addi %arg0, %c2 : index
%75 = addi %13, %74 : index
%76 = addi %6, %75 : index
%77 = vector.transfer_read %0[%c0, %76, %58, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%78 = vector.extract %77[0] : vector<3xf32>
%79 = vector.broadcast %78 : f32 to vector<4xf32>
%80 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
%81 = mulf %79, %46 : vector<4xf32>
%82 = addf %81, %80 : vector<4xf32>
%83 = vector.extract %77[1] : vector<3xf32>
%84 = vector.broadcast %83 : f32 to vector<4xf32>
%85 = mulf %84, %50 : vector<4xf32>
%86 = addf %85, %82 : vector<4xf32>
%87 = vector.extract %77[2] : vector<3xf32>
%88 = vector.broadcast %87 : f32 to vector<4xf32>
%89 = mulf %88, %54 : vector<4xf32>
%90 = addf %89, %86 : vector<4xf32>
%91 = vector.shape_cast %90 : vector<4xf32> to vector<1x4xf32>
// Accumulator 2 (%arg8): same pattern with the row offset increased by 4.
%92 = addi %arg0, %c4 : index
%93 = addi %13, %92 : index
%94 = addi %6, %93 : index
%95 = vector.transfer_read %0[%c0, %94, %58, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%96 = vector.extract %95[0] : vector<3xf32>
%97 = vector.broadcast %96 : f32 to vector<4xf32>
%98 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
%99 = mulf %97, %46 : vector<4xf32>
%100 = addf %99, %98 : vector<4xf32>
%101 = vector.extract %95[1] : vector<3xf32>
%102 = vector.broadcast %101 : f32 to vector<4xf32>
%103 = mulf %102, %50 : vector<4xf32>
%104 = addf %103, %100 : vector<4xf32>
%105 = vector.extract %95[2] : vector<3xf32>
%106 = vector.broadcast %105 : f32 to vector<4xf32>
%107 = mulf %106, %54 : vector<4xf32>
%108 = addf %107, %104 : vector<4xf32>
%109 = vector.shape_cast %108 : vector<4xf32> to vector<1x4xf32>
// Accumulator 3 (%arg9): same pattern with the row offset increased by 6.
%110 = addi %arg0, %c6 : index
%111 = addi %13, %110 : index
%112 = addi %6, %111 : index
%113 = vector.transfer_read %0[%c0, %112, %58, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%114 = vector.extract %113[0] : vector<3xf32>
%115 = vector.broadcast %114 : f32 to vector<4xf32>
%116 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
%117 = mulf %115, %46 : vector<4xf32>
%118 = addf %117, %116 : vector<4xf32>
%119 = vector.extract %113[1] : vector<3xf32>
%120 = vector.broadcast %119 : f32 to vector<4xf32>
%121 = mulf %120, %50 : vector<4xf32>
%122 = addf %121, %118 : vector<4xf32>
%123 = vector.extract %113[2] : vector<3xf32>
%124 = vector.broadcast %123 : f32 to vector<4xf32>
%125 = mulf %124, %54 : vector<4xf32>
%126 = addf %125, %122 : vector<4xf32>
%127 = vector.shape_cast %126 : vector<4xf32> to vector<1x4xf32>
scf.yield %73, %91, %109, %127 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
scf.yield %41#0, %41#1, %41#2, %41#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
}
// Write-back: results #3, #2, #1, #0 are stored to %2 at row offsets +3, +2, +1, +0 from %3 + %11.
// Each store divides %20 by a freshly emitted constant 4 for the last index, and is preceded by a
// shape_cast and bitcast whose source and result types are identical.
%16 = vector.shape_cast %15#3 : vector<1x4xf32> to vector<4xf32>
%17 = addi %11, %c3 : index
%18 = addi %3, %17 : index
%19 = addi %4, %9 : index
%20 = addi %5, %12 : index
%c4_1 = constant 4 : index
%21 = divi_signed %20, %c4_1 : index
%22 = vector.shape_cast %16 : vector<4xf32> to vector<4xf32>
%23 = vector.bitcast %22 : vector<4xf32> to vector<4xf32>
memref.store %23, %2[%c0, %18, %19, %21] : memref<1x112x112x8xvector<4xf32>>
%24 = vector.shape_cast %15#2 : vector<1x4xf32> to vector<4xf32>
%25 = addi %11, %c2 : index
%26 = addi %3, %25 : index
%c4_2 = constant 4 : index
%27 = divi_signed %20, %c4_2 : index
%28 = vector.shape_cast %24 : vector<4xf32> to vector<4xf32>
%29 = vector.bitcast %28 : vector<4xf32> to vector<4xf32>
memref.store %29, %2[%c0, %26, %19, %27] : memref<1x112x112x8xvector<4xf32>>
%30 = vector.shape_cast %15#1 : vector<1x4xf32> to vector<4xf32>
%31 = addi %11, %c1 : index
%32 = addi %3, %31 : index
%c4_3 = constant 4 : index
%33 = divi_signed %20, %c4_3 : index
%34 = vector.shape_cast %30 : vector<4xf32> to vector<4xf32>
%35 = vector.bitcast %34 : vector<4xf32> to vector<4xf32>
memref.store %35, %2[%c0, %32, %19, %33] : memref<1x112x112x8xvector<4xf32>>
%36 = vector.shape_cast %15#0 : vector<1x4xf32> to vector<4xf32>
%37 = addi %3, %11 : index
%c4_4 = constant 4 : index
%38 = divi_signed %20, %c4_4 : index
%39 = vector.shape_cast %36 : vector<4xf32> to vector<4xf32>
%40 = vector.bitcast %39 : vector<4xf32> to vector<4xf32>
memref.store %40, %2[%c0, %37, %19, %38] : memref<1x112x112x8xvector<4xf32>>
return
}
// Private binding table: three StorageBuffer bindings in set 0 (slots 0, 1, 2).
// @ro0 and @ro1 are declared with Read access, @wo2 with Write|Discard access.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ForOpCanonicalizationPass ***
// Entry point with local_size [4, 4, 1]. Each invocation runs a 3x3 scf.for nest that
// carries four vector<4xf32> accumulators, then stores the four loop results directly into %2.
// In this dump the loop-carried values are vector<4xf32>, so the loop body has no shape_casts.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%cst = constant dense<0.000000e+00> : vector<1x4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffer views: %1 and %2 have vector<4xf32> elements and a last dimension of 8; %0 has f32 elements.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-id-scaled base offsets: %3 and %4 feed indices 1 and 2 of %2, %5 feeds the last
// index of both %1 and %2 (after division by 4), and %6 and %7 feed indices 1 and 2 of %0.
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
// Thread-id-scaled offsets that are later added to the workgroup bases above.
%8 = "gpu.thread_id"() {dimension = "x"} : () -> index
%9 = "gpu.thread_id"() {dimension = "y"} : () -> index
%10 = "gpu.thread_id"() {dimension = "z"} : () -> index
%11 = muli %10, %c4 : index
%12 = muli %8, %c4 : index
%13 = muli %10, %c8 : index
%14 = muli %9, %c2 : index
// The vector<1x4xf32> zero constant is shape_cast four times to seed the four vector<4xf32> iter_args.
%15 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%16 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%17 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
%18 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
// Outer loop (%arg0) and inner loop (%arg5) both run 0..3 step 1.
%19:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %15, %arg2 = %16, %arg3 = %17, %arg4 = %18) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
// Three loads from %1 at third index 0, 1 and 2. The last index (%35, %37, %39) is the same
// divi_signed of %34 by %c4, recomputed for each load in this dump.
%34 = addi %5, %12 : index
%35 = divi_signed %34, %c4 : index
%36 = memref.load %1[%arg0, %arg5, %c0, %35] : memref<3x3x3x8xvector<4xf32>>
%37 = divi_signed %34, %c4 : index
%38 = memref.load %1[%arg0, %arg5, %c1, %37] : memref<3x3x3x8xvector<4xf32>>
%39 = divi_signed %34, %c4 : index
%40 = memref.load %1[%arg0, %arg5, %c2, %39] : memref<3x3x3x8xvector<4xf32>>
// Accumulator 0 (%arg6): read a vector<3xf32> from %0 at row %6 + %13 + %arg0, then broadcast
// each of its three elements and multiply-add with %36, %38 and %40 in turn.
// The column index %44 computed here is reused by the other three reads from %0 below.
%41 = addi %13, %arg0 : index
%42 = addi %14, %arg5 : index
%43 = addi %6, %41 : index
%44 = addi %7, %42 : index
%45 = vector.transfer_read %0[%c0, %43, %44, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%46 = vector.extract %45[0] : vector<3xf32>
%47 = vector.broadcast %46 : f32 to vector<4xf32>
%48 = mulf %47, %36 : vector<4xf32>
%49 = addf %48, %arg6 : vector<4xf32>
%50 = vector.extract %45[1] : vector<3xf32>
%51 = vector.broadcast %50 : f32 to vector<4xf32>
%52 = mulf %51, %38 : vector<4xf32>
%53 = addf %52, %49 : vector<4xf32>
%54 = vector.extract %45[2] : vector<3xf32>
%55 = vector.broadcast %54 : f32 to vector<4xf32>
%56 = mulf %55, %40 : vector<4xf32>
%57 = addf %56, %53 : vector<4xf32>
// Accumulator 1 (%arg7): same pattern with the row offset increased by 2.
%58 = addi %arg0, %c2 : index
%59 = addi %13, %58 : index
%60 = addi %6, %59 : index
%61 = vector.transfer_read %0[%c0, %60, %44, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%62 = vector.extract %61[0] : vector<3xf32>
%63 = vector.broadcast %62 : f32 to vector<4xf32>
%64 = mulf %63, %36 : vector<4xf32>
%65 = addf %64, %arg7 : vector<4xf32>
%66 = vector.extract %61[1] : vector<3xf32>
%67 = vector.broadcast %66 : f32 to vector<4xf32>
%68 = mulf %67, %38 : vector<4xf32>
%69 = addf %68, %65 : vector<4xf32>
%70 = vector.extract %61[2] : vector<3xf32>
%71 = vector.broadcast %70 : f32 to vector<4xf32>
%72 = mulf %71, %40 : vector<4xf32>
%73 = addf %72, %69 : vector<4xf32>
// Accumulator 2 (%arg8): same pattern with the row offset increased by 4.
%74 = addi %arg0, %c4 : index
%75 = addi %13, %74 : index
%76 = addi %6, %75 : index
%77 = vector.transfer_read %0[%c0, %76, %44, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%78 = vector.extract %77[0] : vector<3xf32>
%79 = vector.broadcast %78 : f32 to vector<4xf32>
%80 = mulf %79, %36 : vector<4xf32>
%81 = addf %80, %arg8 : vector<4xf32>
%82 = vector.extract %77[1] : vector<3xf32>
%83 = vector.broadcast %82 : f32 to vector<4xf32>
%84 = mulf %83, %38 : vector<4xf32>
%85 = addf %84, %81 : vector<4xf32>
%86 = vector.extract %77[2] : vector<3xf32>
%87 = vector.broadcast %86 : f32 to vector<4xf32>
%88 = mulf %87, %40 : vector<4xf32>
%89 = addf %88, %85 : vector<4xf32>
// Accumulator 3 (%arg9): same pattern with the row offset increased by 6.
%90 = addi %arg0, %c6 : index
%91 = addi %13, %90 : index
%92 = addi %6, %91 : index
%93 = vector.transfer_read %0[%c0, %92, %44, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%94 = vector.extract %93[0] : vector<3xf32>
%95 = vector.broadcast %94 : f32 to vector<4xf32>
%96 = mulf %95, %36 : vector<4xf32>
%97 = addf %96, %arg9 : vector<4xf32>
%98 = vector.extract %93[1] : vector<3xf32>
%99 = vector.broadcast %98 : f32 to vector<4xf32>
%100 = mulf %99, %38 : vector<4xf32>
%101 = addf %100, %97 : vector<4xf32>
%102 = vector.extract %93[2] : vector<3xf32>
%103 = vector.broadcast %102 : f32 to vector<4xf32>
%104 = mulf %103, %40 : vector<4xf32>
%105 = addf %104, %101 : vector<4xf32>
scf.yield %57, %73, %89, %105 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
scf.yield %33#0, %33#1, %33#2, %33#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
// Write-back: results #3, #2, #1, #0 are stored to %2 at row offsets +3, +2, +1, +0 from %3 + %11.
// All four stores share the column index %22; the last index (%23 divided by %c4) is recomputed per store.
%20 = addi %11, %c3 : index
%21 = addi %3, %20 : index
%22 = addi %4, %9 : index
%23 = addi %5, %12 : index
%24 = divi_signed %23, %c4 : index
memref.store %19#3, %2[%c0, %21, %22, %24] : memref<1x112x112x8xvector<4xf32>>
%25 = addi %11, %c2 : index
%26 = addi %3, %25 : index
%27 = divi_signed %23, %c4 : index
memref.store %19#2, %2[%c0, %26, %22, %27] : memref<1x112x112x8xvector<4xf32>>
%28 = addi %11, %c1 : index
%29 = addi %3, %28 : index
%30 = divi_signed %23, %c4 : index
memref.store %19#1, %2[%c0, %29, %22, %30] : memref<1x112x112x8xvector<4xf32>>
%31 = addi %3, %11 : index
%32 = divi_signed %23, %c4 : index
memref.store %19#0, %2[%c0, %31, %22, %32] : memref<1x112x112x8xvector<4xf32>>
return
}
// *** IR Dump After Canonicalizer ***
// Dispatch module as it stands after the Canonicalizer pass. The module carries
// the SPIR-V target environment (v1.3, ARM:IntegratedGPU, subgroup_size = 16)
// and contains one kernel, @conv_dispatch_0, plus the HAL interface describing
// its three storage-buffer bindings.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Kernel entry point; the ABI attribute fixes the workgroup size at 4x4x1.
// Each invocation accumulates four vector<4xf32> results (%15#0..%15#3) and
// stores them to four consecutive rows of the output buffer.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
// Zero vector used to seed the four accumulators.
%cst = constant dense<0.000000e+00> : vector<4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
// Scalar zero passed as the padding value of the vector.transfer_read ops.
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// Buffer bindings: %0 is read as scalar f32 (1x225x225x3), while %1 and %2 are
// typed with vector<4xf32> elements and a trailing dimension of 8.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Per-workgroup offsets used when indexing %2 (z*4, y*4) and %1/%2's last
// dimension (x*16, later divided by 4).
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
// Per-workgroup offsets used when indexing %0 (z*8, y*8), i.e. twice the
// corresponding %2 offsets above.
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
%8 = "gpu.thread_id"() {dimension = "x"} : () -> index
%9 = "gpu.thread_id"() {dimension = "y"} : () -> index
%10 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Per-invocation offsets: %11/%12 feed the %2 (and %1) indices, %13/%14 feed
// the %0 indices.
%11 = muli %10, %c4 : index
%12 = muli %8, %c4 : index
%13 = muli %10, %c8 : index
%14 = muli %9, %c2 : index
// Two nested 0..3 loops; %arg0 and %arg5 index the first two dimensions of %1.
// The four iter_args are the running accumulators, threaded through both loops.
%15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%29:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
// Load three vectors from %1 at [%arg0, %arg5, {0,1,2}, %30 / 4].
// %31, %33 and %35 are the same divi_signed computed three times.
%30 = addi %5, %12 : index
%31 = divi_signed %30, %c4 : index
%32 = memref.load %1[%arg0, %arg5, %c0, %31] : memref<3x3x3x8xvector<4xf32>>
%33 = divi_signed %30, %c4 : index
%34 = memref.load %1[%arg0, %arg5, %c1, %33] : memref<3x3x3x8xvector<4xf32>>
%35 = divi_signed %30, %c4 : index
%36 = memref.load %1[%arg0, %arg5, %c2, %35] : memref<3x3x3x8xvector<4xf32>>
// Accumulator 0: read 3 floats from %0 at row %6 + %13 + %arg0, column
// %7 + %14 + %arg5, then broadcast each float and multiply-add it with the
// matching vector loaded from %1.
%37 = addi %13, %arg0 : index
%38 = addi %14, %arg5 : index
%39 = addi %6, %37 : index
%40 = addi %7, %38 : index
%41 = vector.transfer_read %0[%c0, %39, %40, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%42 = vector.extract %41[0] : vector<3xf32>
%43 = vector.broadcast %42 : f32 to vector<4xf32>
%44 = mulf %43, %32 : vector<4xf32>
%45 = addf %44, %arg6 : vector<4xf32>
%46 = vector.extract %41[1] : vector<3xf32>
%47 = vector.broadcast %46 : f32 to vector<4xf32>
%48 = mulf %47, %34 : vector<4xf32>
%49 = addf %48, %45 : vector<4xf32>
%50 = vector.extract %41[2] : vector<3xf32>
%51 = vector.broadcast %50 : f32 to vector<4xf32>
%52 = mulf %51, %36 : vector<4xf32>
%53 = addf %52, %49 : vector<4xf32>
// Accumulator 1: same column, row advanced by 2.
%54 = addi %arg0, %c2 : index
%55 = addi %13, %54 : index
%56 = addi %6, %55 : index
%57 = vector.transfer_read %0[%c0, %56, %40, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%58 = vector.extract %57[0] : vector<3xf32>
%59 = vector.broadcast %58 : f32 to vector<4xf32>
%60 = mulf %59, %32 : vector<4xf32>
%61 = addf %60, %arg7 : vector<4xf32>
%62 = vector.extract %57[1] : vector<3xf32>
%63 = vector.broadcast %62 : f32 to vector<4xf32>
%64 = mulf %63, %34 : vector<4xf32>
%65 = addf %64, %61 : vector<4xf32>
%66 = vector.extract %57[2] : vector<3xf32>
%67 = vector.broadcast %66 : f32 to vector<4xf32>
%68 = mulf %67, %36 : vector<4xf32>
%69 = addf %68, %65 : vector<4xf32>
// Accumulator 2: same column, row advanced by 4.
%70 = addi %arg0, %c4 : index
%71 = addi %13, %70 : index
%72 = addi %6, %71 : index
%73 = vector.transfer_read %0[%c0, %72, %40, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%74 = vector.extract %73[0] : vector<3xf32>
%75 = vector.broadcast %74 : f32 to vector<4xf32>
%76 = mulf %75, %32 : vector<4xf32>
%77 = addf %76, %arg8 : vector<4xf32>
%78 = vector.extract %73[1] : vector<3xf32>
%79 = vector.broadcast %78 : f32 to vector<4xf32>
%80 = mulf %79, %34 : vector<4xf32>
%81 = addf %80, %77 : vector<4xf32>
%82 = vector.extract %73[2] : vector<3xf32>
%83 = vector.broadcast %82 : f32 to vector<4xf32>
%84 = mulf %83, %36 : vector<4xf32>
%85 = addf %84, %81 : vector<4xf32>
// Accumulator 3: same column, row advanced by 6.
%86 = addi %arg0, %c6 : index
%87 = addi %13, %86 : index
%88 = addi %6, %87 : index
%89 = vector.transfer_read %0[%c0, %88, %40, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%90 = vector.extract %89[0] : vector<3xf32>
%91 = vector.broadcast %90 : f32 to vector<4xf32>
%92 = mulf %91, %32 : vector<4xf32>
%93 = addf %92, %arg9 : vector<4xf32>
%94 = vector.extract %89[1] : vector<3xf32>
%95 = vector.broadcast %94 : f32 to vector<4xf32>
%96 = mulf %95, %34 : vector<4xf32>
%97 = addf %96, %93 : vector<4xf32>
%98 = vector.extract %89[2] : vector<3xf32>
%99 = vector.broadcast %98 : f32 to vector<4xf32>
%100 = mulf %99, %36 : vector<4xf32>
%101 = addf %100, %97 : vector<4xf32>
scf.yield %53, %69, %85, %101 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
// The outer loop forwards the inner loop's results unchanged.
scf.yield %29#0, %29#1, %29#2, %29#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
// Write-back: accumulator k goes to %2 at row %3 + %11 + k, column %4 + %9,
// last index (%5 + %12) / 4. Stores are emitted in the order k = 3, 2, 1, 0,
// and the divi_signed is recomputed for each store (%20, %23, %26, %28).
%16 = addi %11, %c3 : index
%17 = addi %3, %16 : index
%18 = addi %4, %9 : index
%19 = addi %5, %12 : index
%20 = divi_signed %19, %c4 : index
memref.store %15#3, %2[%c0, %17, %18, %20] : memref<1x112x112x8xvector<4xf32>>
%21 = addi %11, %c2 : index
%22 = addi %3, %21 : index
%23 = divi_signed %19, %c4 : index
memref.store %15#2, %2[%c0, %22, %18, %23] : memref<1x112x112x8xvector<4xf32>>
%24 = addi %11, %c1 : index
%25 = addi %3, %24 : index
%26 = divi_signed %19, %c4 : index
memref.store %15#1, %2[%c0, %25, %18, %26] : memref<1x112x112x8xvector<4xf32>>
%27 = addi %3, %11 : index
%28 = divi_signed %19, %c4 : index
memref.store %15#0, %2[%c0, %27, %18, %28] : memref<1x112x112x8xvector<4xf32>>
return
}
// Binding table referenced by the hal.interface.binding.subspan ops above:
// two read-only storage buffers and one write/discard storage buffer, all in
// descriptor set 0.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After CSE ***
// Dispatch module as it stands after the CSE pass. Same target environment
// (SPIR-V v1.3, ARM:IntegratedGPU, subgroup_size = 16), one kernel
// @conv_dispatch_0, and the HAL interface with its three bindings.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// Kernel entry point with a 4x4x1 workgroup. Each invocation produces four
// vector<4xf32> accumulators (%15#0..%15#3) and stores them to %2.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
// Zero vector used as the initial value of every accumulator.
%cst = constant dense<0.000000e+00> : vector<4xf32>
%c4 = constant 4 : index
%c2 = constant 2 : index
%c6 = constant 6 : index
// Padding operand of the vector.transfer_read ops.
%cst_0 = constant 0.000000e+00 : f32
%c1 = constant 1 : index
%c3 = constant 3 : index
%c0 = constant 0 : index
%c16 = constant 16 : index
%c8 = constant 8 : index
// %0: scalar f32 buffer read via transfer_read; %1 and %2: buffers whose
// elements are vector<4xf32>.
%0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
%1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x8xvector<4xf32>>
%2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x8xvector<4xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
// Workgroup-level offsets: %3/%4/%5 are used for %2 (and %1's last index),
// %6/%7 are used for %0 and are twice %3/%4.
%3 = muli %workgroup_id_z, %c4 : index
%4 = muli %workgroup_id_y, %c4 : index
%5 = muli %workgroup_id_x, %c16 : index
%6 = muli %workgroup_id_z, %c8 : index
%7 = muli %workgroup_id_y, %c8 : index
%8 = "gpu.thread_id"() {dimension = "x"} : () -> index
%9 = "gpu.thread_id"() {dimension = "y"} : () -> index
%10 = "gpu.thread_id"() {dimension = "z"} : () -> index
// Invocation-level offsets derived from the thread ids.
%11 = muli %10, %c4 : index
%12 = muli %8, %c4 : index
%13 = muli %10, %c8 : index
%14 = muli %9, %c2 : index
// Nested 0..3 loops over the first two dimensions of %1; the accumulators are
// carried as iter_args through both levels.
%15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
%26:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
// A single quotient %28 = (%5 + %12) / 4 indexes all three loads from %1.
// Both operands of %27 are defined outside the loops, so %27/%28 do not
// depend on %arg0 or %arg5.
%27 = addi %5, %12 : index
%28 = divi_signed %27, %c4 : index
%29 = memref.load %1[%arg0, %arg5, %c0, %28] : memref<3x3x3x8xvector<4xf32>>
%30 = memref.load %1[%arg0, %arg5, %c1, %28] : memref<3x3x3x8xvector<4xf32>>
%31 = memref.load %1[%arg0, %arg5, %c2, %28] : memref<3x3x3x8xvector<4xf32>>
// Accumulator 0: 3 floats from %0 at row %6 + %13 + %arg0 and column
// %7 + %14 + %arg5; each float is broadcast and multiply-added with the
// corresponding vector from %1.
%32 = addi %13, %arg0 : index
%33 = addi %14, %arg5 : index
%34 = addi %6, %32 : index
%35 = addi %7, %33 : index
%36 = vector.transfer_read %0[%c0, %34, %35, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%37 = vector.extract %36[0] : vector<3xf32>
%38 = vector.broadcast %37 : f32 to vector<4xf32>
%39 = mulf %38, %29 : vector<4xf32>
%40 = addf %39, %arg6 : vector<4xf32>
%41 = vector.extract %36[1] : vector<3xf32>
%42 = vector.broadcast %41 : f32 to vector<4xf32>
%43 = mulf %42, %30 : vector<4xf32>
%44 = addf %43, %40 : vector<4xf32>
%45 = vector.extract %36[2] : vector<3xf32>
%46 = vector.broadcast %45 : f32 to vector<4xf32>
%47 = mulf %46, %31 : vector<4xf32>
%48 = addf %47, %44 : vector<4xf32>
// Accumulator 1: row offset +2, same column %35.
%49 = addi %arg0, %c2 : index
%50 = addi %13, %49 : index
%51 = addi %6, %50 : index
%52 = vector.transfer_read %0[%c0, %51, %35, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%53 = vector.extract %52[0] : vector<3xf32>
%54 = vector.broadcast %53 : f32 to vector<4xf32>
%55 = mulf %54, %29 : vector<4xf32>
%56 = addf %55, %arg7 : vector<4xf32>
%57 = vector.extract %52[1] : vector<3xf32>
%58 = vector.broadcast %57 : f32 to vector<4xf32>
%59 = mulf %58, %30 : vector<4xf32>
%60 = addf %59, %56 : vector<4xf32>
%61 = vector.extract %52[2] : vector<3xf32>
%62 = vector.broadcast %61 : f32 to vector<4xf32>
%63 = mulf %62, %31 : vector<4xf32>
%64 = addf %63, %60 : vector<4xf32>
// Accumulator 2: row offset +4, same column %35.
%65 = addi %arg0, %c4 : index
%66 = addi %13, %65 : index
%67 = addi %6, %66 : index
%68 = vector.transfer_read %0[%c0, %67, %35, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%69 = vector.extract %68[0] : vector<3xf32>
%70 = vector.broadcast %69 : f32 to vector<4xf32>
%71 = mulf %70, %29 : vector<4xf32>
%72 = addf %71, %arg8 : vector<4xf32>
%73 = vector.extract %68[1] : vector<3xf32>
%74 = vector.broadcast %73 : f32 to vector<4xf32>
%75 = mulf %74, %30 : vector<4xf32>
%76 = addf %75, %72 : vector<4xf32>
%77 = vector.extract %68[2] : vector<3xf32>
%78 = vector.broadcast %77 : f32 to vector<4xf32>
%79 = mulf %78, %31 : vector<4xf32>
%80 = addf %79, %76 : vector<4xf32>
// Accumulator 3: row offset +6, same column %35.
%81 = addi %arg0, %c6 : index
%82 = addi %13, %81 : index
%83 = addi %6, %82 : index
%84 = vector.transfer_read %0[%c0, %83, %35, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
%85 = vector.extract %84[0] : vector<3xf32>
%86 = vector.broadcast %85 : f32 to vector<4xf32>
%87 = mulf %86, %29 : vector<4xf32>
%88 = addf %87, %arg9 : vector<4xf32>
%89 = vector.extract %84[1] : vector<3xf32>
%90 = vector.broadcast %89 : f32 to vector<4xf32>
%91 = mulf %90, %30 : vector<4xf32>
%92 = addf %91, %88 : vector<4xf32>
%93 = vector.extract %84[2] : vector<3xf32>
%94 = vector.broadcast %93 : f32 to vector<4xf32>
%95 = mulf %94, %31 : vector<4xf32>
%96 = addf %95, %92 : vector<4xf32>
scf.yield %48, %64, %80, %96 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
// Pass the inner loop's results straight through to the outer loop.
scf.yield %26#0, %26#1, %26#2, %26#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
}
// Write-back: accumulator k is stored to %2 at row %3 + %11 + k, column
// %4 + %9, last index %20. %19/%20 repeat the %27/%28 expression from inside
// the loop body, and %20 is shared by all four stores.
%16 = addi %11, %c3 : index
%17 = addi %3, %16 : index
%18 = addi %4, %9 : index
%19 = addi %5, %12 : index
%20 = divi_signed %19, %c4 : index
memref.store %15#3, %2[%c0, %17, %18, %20] : memref<1x112x112x8xvector<4xf32>>
%21 = addi %11, %c2 : index
%22 = addi %3, %21 : index
memref.store %15#2, %2[%c0, %22, %18, %20] : memref<1x112x112x8xvector<4xf32>>
%23 = addi %11, %c1 : index
%24 = addi %3, %23 : index
memref.store %15#1, %2[%c0, %24, %18, %20] : memref<1x112x112x8xvector<4xf32>>
%25 = addi %3, %11 : index
memref.store %15#0, %2[%c0, %25, %18, %20] : memref<1x112x112x8xvector<4xf32>>
return
}
// Binding table for the subspan ops above: @ro0/@ro1 are read-only, @wo2 is
// write/discard; all are storage buffers in set 0.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ConvertToSPIRVPass ***
// Dispatch module after conversion to the SPIR-V dialect. The outer module
// keeps the target environment attribute and the HAL interface; the kernel now
// lives in a nested spv.module (Logical addressing, GLSL450 memory model).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 {
// Built-in inputs read by the kernel.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Storage buffers, each a struct wrapping one flat runtime-sized-free array:
//   bind(0, 2): 100352 x vector<4xf32>  (100352 = 112 * 112 * 8)
//   bind(0, 1):    216 x vector<4xf32>  (216 = 3 * 3 * 3 * 8)
//   bind(0, 0): 151875 x f32            (151875 = 225 * 225 * 3)
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
// Kernel body. Index arithmetic is i32 here, and every multi-dimensional
// access has been linearized into a single array index.
spv.func @conv_dispatch_0() "None" attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 0.000000e+00 : f32
%5 = spv.Constant 1 : i32
%6 = spv.Constant 3 : i32
%7 = spv.Constant 0 : i32
%8 = spv.Constant 16 : i32
%9 = spv.Constant 8 : i32
// Pointers to the three storage buffers: %10 -> bind(0, 0), %11 -> bind(0, 1),
// %12 -> bind(0, 2).
%10 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%11 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%12 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// WorkgroupId is loaded once per component: %15 = x, %18 = y, %21 = z.
%13 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%14 = spv.Load "Input" %13 : vector<3xi32>
%15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
%19 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%20 = spv.Load "Input" %19 : vector<3xi32>
%21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
// Workgroup-level offsets: z*4, y*4, x*16, z*8, y*8.
%22 = spv.IMul %21, %1 : i32
%23 = spv.IMul %18, %1 : i32
%24 = spv.IMul %15, %8 : i32
%25 = spv.IMul %21, %9 : i32
%26 = spv.IMul %18, %9 : i32
// LocalInvocationId components: %29 = x, %32 = y, %35 = z.
%27 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%28 = spv.Load "Input" %27 : vector<3xi32>
%29 = spv.CompositeExtract %28[0 : i32] : vector<3xi32>
%30 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%31 = spv.Load "Input" %30 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%34 = spv.Load "Input" %33 : vector<3xi32>
%35 = spv.CompositeExtract %34[2 : i32] : vector<3xi32>
// Invocation-level offsets: z*4, x*4, z*8, y*2.
%36 = spv.IMul %35, %1 : i32
%37 = spv.IMul %29, %1 : i32
%38 = spv.IMul %35, %9 : i32
%39 = spv.IMul %32, %2 : i32
// Function-storage variables that carry the outer loop's four accumulators
// out of the spv.mlir.loop region (they are loaded after the loop as %44..%47).
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%43 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: induction value %118 counts from 0 while %118 < 3; %119..%122
// are the accumulators carried as header block arguments.
spv.mlir.loop {
spv.Branch ^bb1(%7, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%118: i32, %119: vector<4xf32>, %120: vector<4xf32>, %121: vector<4xf32>, %122: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%123 = spv.SLessThan %118, %6 : i32
spv.BranchConditional %123, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Variables carrying the inner loop's accumulators out of its region.
// NOTE(review): these spv.Variable ops sit inside a loop body block, whereas
// the SPIR-V spec wants Function-storage OpVariable in the function's first
// block - presumably handled by a later pass or at serialization; confirm.
%124 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%125 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%126 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%127 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: induction value %133 counts from 0 while %133 < 3; %134..%137
// are the accumulators, seeded from the outer loop's current values.
spv.mlir.loop {
spv.Branch ^bb1(%7, %119, %120, %121, %122 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%133: i32, %134: vector<4xf32>, %135: vector<4xf32>, %136: vector<4xf32>, %137: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%138 = spv.SLessThan %133, %6 : i32
spv.BranchConditional %138, ^bb2, ^bb3
^bb2: // pred: ^bb1
// %140 = (x-workgroup offset + x-invocation offset) / 4.
%139 = spv.IAdd %24, %37 : i32
%140 = spv.SDiv %139, %1 : i32
// Load #1 from %11 at linear index 72*%118 + 24*%133 + 8*0 + %140.
%141 = spv.Constant 0 : i32
%142 = spv.Constant 0 : i32
%143 = spv.Constant 72 : i32
%144 = spv.IMul %143, %118 : i32
%145 = spv.IAdd %142, %144 : i32
%146 = spv.Constant 24 : i32
%147 = spv.IMul %146, %133 : i32
%148 = spv.IAdd %145, %147 : i32
%149 = spv.Constant 8 : i32
%150 = spv.IMul %149, %7 : i32
%151 = spv.IAdd %148, %150 : i32
%152 = spv.Constant 1 : i32
%153 = spv.IMul %152, %140 : i32
%154 = spv.IAdd %151, %153 : i32
%155 = spv.AccessChain %11[%141, %154] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%156 = spv.Load "StorageBuffer" %155 : vector<4xf32>
// Load #2 from %11 at linear index 72*%118 + 24*%133 + 8*1 + %140.
%157 = spv.Constant 0 : i32
%158 = spv.Constant 0 : i32
%159 = spv.Constant 72 : i32
%160 = spv.IMul %159, %118 : i32
%161 = spv.IAdd %158, %160 : i32
%162 = spv.Constant 24 : i32
%163 = spv.IMul %162, %133 : i32
%164 = spv.IAdd %161, %163 : i32
%165 = spv.Constant 8 : i32
%166 = spv.IMul %165, %5 : i32
%167 = spv.IAdd %164, %166 : i32
%168 = spv.Constant 1 : i32
%169 = spv.IMul %168, %140 : i32
%170 = spv.IAdd %167, %169 : i32
%171 = spv.AccessChain %11[%157, %170] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%172 = spv.Load "StorageBuffer" %171 : vector<4xf32>
// Load #3 from %11 at linear index 72*%118 + 24*%133 + 8*2 + %140.
%173 = spv.Constant 0 : i32
%174 = spv.Constant 0 : i32
%175 = spv.Constant 72 : i32
%176 = spv.IMul %175, %118 : i32
%177 = spv.IAdd %174, %176 : i32
%178 = spv.Constant 24 : i32
%179 = spv.IMul %178, %133 : i32
%180 = spv.IAdd %177, %179 : i32
%181 = spv.Constant 8 : i32
%182 = spv.IMul %181, %2 : i32
%183 = spv.IAdd %180, %182 : i32
%184 = spv.Constant 1 : i32
%185 = spv.IMul %184, %140 : i32
%186 = spv.IAdd %183, %185 : i32
%187 = spv.AccessChain %11[%173, %186] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%188 = spv.Load "StorageBuffer" %187 : vector<4xf32>
// Row (%191) and column (%192) used to index %10 for accumulator 0; %192 is
// reused by all four accumulator groups below.
%189 = spv.IAdd %38, %118 : i32
%190 = spv.IAdd %39, %133 : i32
%191 = spv.IAdd %25, %189 : i32
%192 = spv.IAdd %26, %190 : i32
// Accumulator 0: three scalar loads from %10 at linear index
// 151875*0 + 675*%191 + 3*%192 + {0, 1, 2}.
%193 = spv.Constant 0 : i32
%194 = spv.Constant 0 : i32
%195 = spv.Constant 0 : i32
%196 = spv.Constant 151875 : i32
%197 = spv.IMul %196, %7 : i32
%198 = spv.IAdd %195, %197 : i32
%199 = spv.Constant 675 : i32
%200 = spv.IMul %199, %191 : i32
%201 = spv.IAdd %198, %200 : i32
%202 = spv.Constant 3 : i32
%203 = spv.IMul %202, %192 : i32
%204 = spv.IAdd %201, %203 : i32
%205 = spv.Constant 1 : i32
%206 = spv.IMul %205, %193 : i32
%207 = spv.IAdd %204, %206 : i32
%208 = spv.AccessChain %10[%194, %207] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%209 = spv.Load "StorageBuffer" %208 : f32
%210 = spv.Constant 1 : i32
%211 = spv.Constant 0 : i32
%212 = spv.Constant 0 : i32
%213 = spv.Constant 151875 : i32
%214 = spv.IMul %213, %7 : i32
%215 = spv.IAdd %212, %214 : i32
%216 = spv.Constant 675 : i32
%217 = spv.IMul %216, %191 : i32
%218 = spv.IAdd %215, %217 : i32
%219 = spv.Constant 3 : i32
%220 = spv.IMul %219, %192 : i32
%221 = spv.IAdd %218, %220 : i32
%222 = spv.Constant 1 : i32
%223 = spv.IMul %222, %210 : i32
%224 = spv.IAdd %221, %223 : i32
%225 = spv.AccessChain %10[%211, %224] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%226 = spv.Load "StorageBuffer" %225 : f32
%227 = spv.Constant 2 : i32
%228 = spv.Constant 0 : i32
%229 = spv.Constant 0 : i32
%230 = spv.Constant 151875 : i32
%231 = spv.IMul %230, %7 : i32
%232 = spv.IAdd %229, %231 : i32
%233 = spv.Constant 675 : i32
%234 = spv.IMul %233, %191 : i32
%235 = spv.IAdd %232, %234 : i32
%236 = spv.Constant 3 : i32
%237 = spv.IMul %236, %192 : i32
%238 = spv.IAdd %235, %237 : i32
%239 = spv.Constant 1 : i32
%240 = spv.IMul %239, %227 : i32
%241 = spv.IAdd %238, %240 : i32
%242 = spv.AccessChain %10[%228, %241] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%243 = spv.Load "StorageBuffer" %242 : f32
// The three scalars are packed into a vector<3xf32> and immediately
// extracted again; each one is splatted to 4 lanes and multiply-added with
// %156 / %172 / %188 onto accumulator %134.
%244 = spv.CompositeConstruct %209, %226, %243 : vector<3xf32>
%245 = spv.CompositeExtract %244[0 : i32] : vector<3xf32>
%246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
%247 = spv.FMul %246, %156 : vector<4xf32>
%248 = spv.FAdd %247, %134 : vector<4xf32>
%249 = spv.CompositeExtract %244[1 : i32] : vector<3xf32>
%250 = spv.CompositeConstruct %249, %249, %249, %249 : vector<4xf32>
%251 = spv.FMul %250, %172 : vector<4xf32>
%252 = spv.FAdd %251, %248 : vector<4xf32>
%253 = spv.CompositeExtract %244[2 : i32] : vector<3xf32>
%254 = spv.CompositeConstruct %253, %253, %253, %253 : vector<4xf32>
%255 = spv.FMul %254, %188 : vector<4xf32>
%256 = spv.FAdd %255, %252 : vector<4xf32>
// Accumulator 1: same pattern with the row advanced by 2 (%259), feeding
// accumulator %135.
%257 = spv.IAdd %118, %2 : i32
%258 = spv.IAdd %38, %257 : i32
%259 = spv.IAdd %25, %258 : i32
%260 = spv.Constant 0 : i32
%261 = spv.Constant 0 : i32
%262 = spv.Constant 0 : i32
%263 = spv.Constant 151875 : i32
%264 = spv.IMul %263, %7 : i32
%265 = spv.IAdd %262, %264 : i32
%266 = spv.Constant 675 : i32
%267 = spv.IMul %266, %259 : i32
%268 = spv.IAdd %265, %267 : i32
%269 = spv.Constant 3 : i32
%270 = spv.IMul %269, %192 : i32
%271 = spv.IAdd %268, %270 : i32
%272 = spv.Constant 1 : i32
%273 = spv.IMul %272, %260 : i32
%274 = spv.IAdd %271, %273 : i32
%275 = spv.AccessChain %10[%261, %274] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%276 = spv.Load "StorageBuffer" %275 : f32
%277 = spv.Constant 1 : i32
%278 = spv.Constant 0 : i32
%279 = spv.Constant 0 : i32
%280 = spv.Constant 151875 : i32
%281 = spv.IMul %280, %7 : i32
%282 = spv.IAdd %279, %281 : i32
%283 = spv.Constant 675 : i32
%284 = spv.IMul %283, %259 : i32
%285 = spv.IAdd %282, %284 : i32
%286 = spv.Constant 3 : i32
%287 = spv.IMul %286, %192 : i32
%288 = spv.IAdd %285, %287 : i32
%289 = spv.Constant 1 : i32
%290 = spv.IMul %289, %277 : i32
%291 = spv.IAdd %288, %290 : i32
%292 = spv.AccessChain %10[%278, %291] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%293 = spv.Load "StorageBuffer" %292 : f32
%294 = spv.Constant 2 : i32
%295 = spv.Constant 0 : i32
%296 = spv.Constant 0 : i32
%297 = spv.Constant 151875 : i32
%298 = spv.IMul %297, %7 : i32
%299 = spv.IAdd %296, %298 : i32
%300 = spv.Constant 675 : i32
%301 = spv.IMul %300, %259 : i32
%302 = spv.IAdd %299, %301 : i32
%303 = spv.Constant 3 : i32
%304 = spv.IMul %303, %192 : i32
%305 = spv.IAdd %302, %304 : i32
%306 = spv.Constant 1 : i32
%307 = spv.IMul %306, %294 : i32
%308 = spv.IAdd %305, %307 : i32
%309 = spv.AccessChain %10[%295, %308] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%310 = spv.Load "StorageBuffer" %309 : f32
%311 = spv.CompositeConstruct %276, %293, %310 : vector<3xf32>
%312 = spv.CompositeExtract %311[0 : i32] : vector<3xf32>
%313 = spv.CompositeConstruct %312, %312, %312, %312 : vector<4xf32>
%314 = spv.FMul %313, %156 : vector<4xf32>
%315 = spv.FAdd %314, %135 : vector<4xf32>
%316 = spv.CompositeExtract %311[1 : i32] : vector<3xf32>
%317 = spv.CompositeConstruct %316, %316, %316, %316 : vector<4xf32>
%318 = spv.FMul %317, %172 : vector<4xf32>
%319 = spv.FAdd %318, %315 : vector<4xf32>
%320 = spv.CompositeExtract %311[2 : i32] : vector<3xf32>
%321 = spv.CompositeConstruct %320, %320, %320, %320 : vector<4xf32>
%322 = spv.FMul %321, %188 : vector<4xf32>
%323 = spv.FAdd %322, %319 : vector<4xf32>
// Accumulator 2: row advanced by 4 (%326), feeding accumulator %136.
%324 = spv.IAdd %118, %1 : i32
%325 = spv.IAdd %38, %324 : i32
%326 = spv.IAdd %25, %325 : i32
%327 = spv.Constant 0 : i32
%328 = spv.Constant 0 : i32
%329 = spv.Constant 0 : i32
%330 = spv.Constant 151875 : i32
%331 = spv.IMul %330, %7 : i32
%332 = spv.IAdd %329, %331 : i32
%333 = spv.Constant 675 : i32
%334 = spv.IMul %333, %326 : i32
%335 = spv.IAdd %332, %334 : i32
%336 = spv.Constant 3 : i32
%337 = spv.IMul %336, %192 : i32
%338 = spv.IAdd %335, %337 : i32
%339 = spv.Constant 1 : i32
%340 = spv.IMul %339, %327 : i32
%341 = spv.IAdd %338, %340 : i32
%342 = spv.AccessChain %10[%328, %341] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%343 = spv.Load "StorageBuffer" %342 : f32
%344 = spv.Constant 1 : i32
%345 = spv.Constant 0 : i32
%346 = spv.Constant 0 : i32
%347 = spv.Constant 151875 : i32
%348 = spv.IMul %347, %7 : i32
%349 = spv.IAdd %346, %348 : i32
%350 = spv.Constant 675 : i32
%351 = spv.IMul %350, %326 : i32
%352 = spv.IAdd %349, %351 : i32
%353 = spv.Constant 3 : i32
%354 = spv.IMul %353, %192 : i32
%355 = spv.IAdd %352, %354 : i32
%356 = spv.Constant 1 : i32
%357 = spv.IMul %356, %344 : i32
%358 = spv.IAdd %355, %357 : i32
%359 = spv.AccessChain %10[%345, %358] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%360 = spv.Load "StorageBuffer" %359 : f32
%361 = spv.Constant 2 : i32
%362 = spv.Constant 0 : i32
%363 = spv.Constant 0 : i32
%364 = spv.Constant 151875 : i32
%365 = spv.IMul %364, %7 : i32
%366 = spv.IAdd %363, %365 : i32
%367 = spv.Constant 675 : i32
%368 = spv.IMul %367, %326 : i32
%369 = spv.IAdd %366, %368 : i32
%370 = spv.Constant 3 : i32
%371 = spv.IMul %370, %192 : i32
%372 = spv.IAdd %369, %371 : i32
%373 = spv.Constant 1 : i32
%374 = spv.IMul %373, %361 : i32
%375 = spv.IAdd %372, %374 : i32
%376 = spv.AccessChain %10[%362, %375] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%377 = spv.Load "StorageBuffer" %376 : f32
%378 = spv.CompositeConstruct %343, %360, %377 : vector<3xf32>
%379 = spv.CompositeExtract %378[0 : i32] : vector<3xf32>
%380 = spv.CompositeConstruct %379, %379, %379, %379 : vector<4xf32>
%381 = spv.FMul %380, %156 : vector<4xf32>
%382 = spv.FAdd %381, %136 : vector<4xf32>
%383 = spv.CompositeExtract %378[1 : i32] : vector<3xf32>
%384 = spv.CompositeConstruct %383, %383, %383, %383 : vector<4xf32>
%385 = spv.FMul %384, %172 : vector<4xf32>
%386 = spv.FAdd %385, %382 : vector<4xf32>
%387 = spv.CompositeExtract %378[2 : i32] : vector<3xf32>
%388 = spv.CompositeConstruct %387, %387, %387, %387 : vector<4xf32>
%389 = spv.FMul %388, %188 : vector<4xf32>
%390 = spv.FAdd %389, %386 : vector<4xf32>
// Accumulator 3: row advanced by 6 (%393), feeding accumulator %137.
%391 = spv.IAdd %118, %3 : i32
%392 = spv.IAdd %38, %391 : i32
%393 = spv.IAdd %25, %392 : i32
%394 = spv.Constant 0 : i32
%395 = spv.Constant 0 : i32
%396 = spv.Constant 0 : i32
%397 = spv.Constant 151875 : i32
%398 = spv.IMul %397, %7 : i32
%399 = spv.IAdd %396, %398 : i32
%400 = spv.Constant 675 : i32
%401 = spv.IMul %400, %393 : i32
%402 = spv.IAdd %399, %401 : i32
%403 = spv.Constant 3 : i32
%404 = spv.IMul %403, %192 : i32
%405 = spv.IAdd %402, %404 : i32
%406 = spv.Constant 1 : i32
%407 = spv.IMul %406, %394 : i32
%408 = spv.IAdd %405, %407 : i32
%409 = spv.AccessChain %10[%395, %408] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%410 = spv.Load "StorageBuffer" %409 : f32
%411 = spv.Constant 1 : i32
%412 = spv.Constant 0 : i32
%413 = spv.Constant 0 : i32
%414 = spv.Constant 151875 : i32
%415 = spv.IMul %414, %7 : i32
%416 = spv.IAdd %413, %415 : i32
%417 = spv.Constant 675 : i32
%418 = spv.IMul %417, %393 : i32
%419 = spv.IAdd %416, %418 : i32
%420 = spv.Constant 3 : i32
%421 = spv.IMul %420, %192 : i32
%422 = spv.IAdd %419, %421 : i32
%423 = spv.Constant 1 : i32
%424 = spv.IMul %423, %411 : i32
%425 = spv.IAdd %422, %424 : i32
%426 = spv.AccessChain %10[%412, %425] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%427 = spv.Load "StorageBuffer" %426 : f32
%428 = spv.Constant 2 : i32
%429 = spv.Constant 0 : i32
%430 = spv.Constant 0 : i32
%431 = spv.Constant 151875 : i32
%432 = spv.IMul %431, %7 : i32
%433 = spv.IAdd %430, %432 : i32
%434 = spv.Constant 675 : i32
%435 = spv.IMul %434, %393 : i32
%436 = spv.IAdd %433, %435 : i32
%437 = spv.Constant 3 : i32
%438 = spv.IMul %437, %192 : i32
%439 = spv.IAdd %436, %438 : i32
%440 = spv.Constant 1 : i32
%441 = spv.IMul %440, %428 : i32
%442 = spv.IAdd %439, %441 : i32
%443 = spv.AccessChain %10[%429, %442] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%444 = spv.Load "StorageBuffer" %443 : f32
%445 = spv.CompositeConstruct %410, %427, %444 : vector<3xf32>
%446 = spv.CompositeExtract %445[0 : i32] : vector<3xf32>
%447 = spv.CompositeConstruct %446, %446, %446, %446 : vector<4xf32>
%448 = spv.FMul %447, %156 : vector<4xf32>
%449 = spv.FAdd %448, %137 : vector<4xf32>
%450 = spv.CompositeExtract %445[1 : i32] : vector<3xf32>
%451 = spv.CompositeConstruct %450, %450, %450, %450 : vector<4xf32>
%452 = spv.FMul %451, %172 : vector<4xf32>
%453 = spv.FAdd %452, %449 : vector<4xf32>
%454 = spv.CompositeExtract %445[2 : i32] : vector<3xf32>
%455 = spv.CompositeConstruct %454, %454, %454, %454 : vector<4xf32>
%456 = spv.FMul %455, %188 : vector<4xf32>
%457 = spv.FAdd %456, %453 : vector<4xf32>
// Publish the updated accumulators through the inner-loop variables on
// every iteration, then branch back to the header with %133 + 1.
spv.Store "Function" %124, %256 : vector<4xf32>
spv.Store "Function" %125, %323 : vector<4xf32>
spv.Store "Function" %126, %390 : vector<4xf32>
spv.Store "Function" %127, %457 : vector<4xf32>
%458 = spv.IAdd %133, %5 : i32
spv.Branch ^bb1(%458, %256, %323, %390, %457 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Read back the inner loop's results (%131 = accumulator 0 ... %128 =
// accumulator 3), copy them into the outer-loop variables, and continue the
// outer loop with %118 + 1.
%128 = spv.Load "Function" %127 : vector<4xf32>
%129 = spv.Load "Function" %126 : vector<4xf32>
%130 = spv.Load "Function" %125 : vector<4xf32>
%131 = spv.Load "Function" %124 : vector<4xf32>
spv.Store "Function" %40, %131 : vector<4xf32>
spv.Store "Function" %41, %130 : vector<4xf32>
spv.Store "Function" %42, %129 : vector<4xf32>
spv.Store "Function" %43, %128 : vector<4xf32>
%132 = spv.IAdd %118, %5 : i32
spv.Branch ^bb1(%132, %131, %130, %129, %128 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulator values: %47 = accumulator 0 ... %44 = accumulator 3.
%44 = spv.Load "Function" %43 : vector<4xf32>
%45 = spv.Load "Function" %42 : vector<4xf32>
%46 = spv.Load "Function" %41 : vector<4xf32>
%47 = spv.Load "Function" %40 : vector<4xf32>
// Store accumulator 3 to %12 at linear index
// 100352*0 + 896*row + 8*%50 + %52, with row = %22 + %36 + 3.
// %50 (column) and %52 (last index) are shared by all four stores.
%48 = spv.IAdd %36, %6 : i32
%49 = spv.IAdd %22, %48 : i32
%50 = spv.IAdd %23, %32 : i32
%51 = spv.IAdd %24, %37 : i32
%52 = spv.SDiv %51, %1 : i32
%53 = spv.Constant 0 : i32
%54 = spv.Constant 0 : i32
%55 = spv.Constant 100352 : i32
%56 = spv.IMul %55, %7 : i32
%57 = spv.IAdd %54, %56 : i32
%58 = spv.Constant 896 : i32
%59 = spv.IMul %58, %49 : i32
%60 = spv.IAdd %57, %59 : i32
%61 = spv.Constant 8 : i32
%62 = spv.IMul %61, %50 : i32
%63 = spv.IAdd %60, %62 : i32
%64 = spv.Constant 1 : i32
%65 = spv.IMul %64, %52 : i32
%66 = spv.IAdd %63, %65 : i32
%67 = spv.AccessChain %12[%53, %66] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %67, %44 : vector<4xf32>
// Store accumulator 2 at row = %22 + %36 + 2.
%68 = spv.IAdd %36, %2 : i32
%69 = spv.IAdd %22, %68 : i32
%70 = spv.Constant 0 : i32
%71 = spv.Constant 0 : i32
%72 = spv.Constant 100352 : i32
%73 = spv.IMul %72, %7 : i32
%74 = spv.IAdd %71, %73 : i32
%75 = spv.Constant 896 : i32
%76 = spv.IMul %75, %69 : i32
%77 = spv.IAdd %74, %76 : i32
%78 = spv.Constant 8 : i32
%79 = spv.IMul %78, %50 : i32
%80 = spv.IAdd %77, %79 : i32
%81 = spv.Constant 1 : i32
%82 = spv.IMul %81, %52 : i32
%83 = spv.IAdd %80, %82 : i32
%84 = spv.AccessChain %12[%70, %83] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %84, %45 : vector<4xf32>
// Store accumulator 1 at row = %22 + %36 + 1.
%85 = spv.IAdd %36, %5 : i32
%86 = spv.IAdd %22, %85 : i32
%87 = spv.Constant 0 : i32
%88 = spv.Constant 0 : i32
%89 = spv.Constant 100352 : i32
%90 = spv.IMul %89, %7 : i32
%91 = spv.IAdd %88, %90 : i32
%92 = spv.Constant 896 : i32
%93 = spv.IMul %92, %86 : i32
%94 = spv.IAdd %91, %93 : i32
%95 = spv.Constant 8 : i32
%96 = spv.IMul %95, %50 : i32
%97 = spv.IAdd %94, %96 : i32
%98 = spv.Constant 1 : i32
%99 = spv.IMul %98, %52 : i32
%100 = spv.IAdd %97, %99 : i32
%101 = spv.AccessChain %12[%87, %100] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %101, %46 : vector<4xf32>
// Store accumulator 0 at row = %22 + %36.
%102 = spv.IAdd %22, %36 : i32
%103 = spv.Constant 0 : i32
%104 = spv.Constant 0 : i32
%105 = spv.Constant 100352 : i32
%106 = spv.IMul %105, %7 : i32
%107 = spv.IAdd %104, %106 : i32
%108 = spv.Constant 896 : i32
%109 = spv.IMul %108, %102 : i32
%110 = spv.IAdd %107, %109 : i32
%111 = spv.Constant 8 : i32
%112 = spv.IMul %111, %50 : i32
%113 = spv.IAdd %110, %112 : i32
%114 = spv.Constant 1 : i32
%115 = spv.IMul %114, %52 : i32
%116 = spv.IAdd %113, %115 : i32
%117 = spv.AccessChain %12[%103, %116] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %117, %47 : vector<4xf32>
spv.Return
}
}
// HAL binding table kept alongside the spv.module: @ro0/@ro1 read-only and
// @wo2 write/discard, all storage buffers in set 0 (bindings 0, 1, 2).
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// *** IR Dump After SPIRVLowerABIAttributes ***
// SPIR-V module for the conv dispatch, as printed after SPIRVLowerABIAttributes.
// Compared with the preceding dump in this log, interface resources and built-ins are now
// module-level spv.GlobalVariable ops, and the entry point / execution mode ops are present.
spv.module Logical GLSL450 {
// Built-in inputs read by the entry function.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Storage buffers in descriptor set 0. In the function below, binding 2 is only stored to and
// bindings 1 and 0 are only loaded from.
// Element counts are consistent with the tensor shapes in @conv's signature at the top of this log:
//   binding 2: 100352 x vec4 = 1x112x112x32 f32, binding 1: 216 x vec4 = 3x3x3x32 f32,
//   binding 0: 151875 x f32 = 1x225x225x3 f32.
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
// Entry function: each invocation accumulates four vec4 results over a 3x3 loop nest and
// writes them to four slots of the binding-2 buffer.
spv.func @conv_dispatch_0() "None" {
// Shared constants. %0 is the zero initial value of every accumulator.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
// NOTE(review): no use of %4 is visible in this function.
%4 = spv.Constant 0.000000e+00 : f32
%5 = spv.Constant 1 : i32
%6 = spv.Constant 3 : i32
%7 = spv.Constant 0 : i32
%8 = spv.Constant 16 : i32
%9 = spv.Constant 8 : i32
// %10 = binding 0, %11 = binding 1, %12 = binding 2.
%10 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%11 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%12 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// WorkgroupId is loaded three times, once per component: %15 = x, %18 = y, %21 = z.
%13 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%14 = spv.Load "Input" %13 : vector<3xi32>
%15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
%19 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%20 = spv.Load "Input" %19 : vector<3xi32>
%21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
// Per-workgroup base offsets: %22 = wg.z * 4, %23 = wg.y * 4, %24 = wg.x * 16 (used for the
// binding-2 and binding-1 indices); %25 = wg.z * 8, %26 = wg.y * 8 (used for the binding-0 index).
%22 = spv.IMul %21, %1 : i32
%23 = spv.IMul %18, %1 : i32
%24 = spv.IMul %15, %8 : i32
%25 = spv.IMul %21, %9 : i32
%26 = spv.IMul %18, %9 : i32
// LocalInvocationId components: %29 = x, %32 = y, %35 = z.
%27 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%28 = spv.Load "Input" %27 : vector<3xi32>
%29 = spv.CompositeExtract %28[0 : i32] : vector<3xi32>
%30 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%31 = spv.Load "Input" %30 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%34 = spv.Load "Input" %33 : vector<3xi32>
%35 = spv.CompositeExtract %34[2 : i32] : vector<3xi32>
// Per-invocation offsets: %36 = local.z * 4, %37 = local.x * 4, %38 = local.z * 8, %39 = local.y * 2.
%36 = spv.IMul %35, %1 : i32
%37 = spv.IMul %29, %1 : i32
%38 = spv.IMul %35, %9 : i32
%39 = spv.IMul %32, %2 : i32
// Function-storage variables that carry the four accumulators out of the outer loop region
// (stored at the end of each outer iteration, loaded after the loop).
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%43 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: %118 runs 0, 1, 2 (bound %6 = 3, step %5 = 1); %119..%122 are the accumulators.
spv.mlir.loop {
spv.Branch ^bb1(%7, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%118: i32, %119: vector<4xf32>, %120: vector<4xf32>, %121: vector<4xf32>, %122: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%123 = spv.SLessThan %118, %6 : i32
spv.BranchConditional %123, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Scratch variables that carry the inner loop's accumulators out of its region.
%124 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%125 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%126 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%127 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: %133 runs 0, 1, 2, continuing from the outer loop's current accumulator values.
spv.mlir.loop {
spv.Branch ^bb1(%7, %119, %120, %121, %122 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%133: i32, %134: vector<4xf32>, %135: vector<4xf32>, %136: vector<4xf32>, %137: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%138 = spv.SLessThan %133, %6 : i32
spv.BranchConditional %138, ^bb2, ^bb3
^bb2: // pred: ^bb1
// %140 = (wg.x * 16 + local.x * 4) / 4: the vec4 slot shared by all binding-1 loads below.
%139 = spv.IAdd %24, %37 : i32
%140 = spv.SDiv %139, %1 : i32
// Binding-1 load %156: element index 72 * %118 + 24 * %133 + 8 * 0 + %140.
%141 = spv.Constant 0 : i32
%142 = spv.Constant 0 : i32
%143 = spv.Constant 72 : i32
%144 = spv.IMul %143, %118 : i32
%145 = spv.IAdd %142, %144 : i32
%146 = spv.Constant 24 : i32
%147 = spv.IMul %146, %133 : i32
%148 = spv.IAdd %145, %147 : i32
%149 = spv.Constant 8 : i32
%150 = spv.IMul %149, %7 : i32
%151 = spv.IAdd %148, %150 : i32
%152 = spv.Constant 1 : i32
%153 = spv.IMul %152, %140 : i32
%154 = spv.IAdd %151, %153 : i32
%155 = spv.AccessChain %11[%141, %154] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%156 = spv.Load "StorageBuffer" %155 : vector<4xf32>
// Binding-1 load %172: same index with 8 * 1 (%5) as the third term.
%157 = spv.Constant 0 : i32
%158 = spv.Constant 0 : i32
%159 = spv.Constant 72 : i32
%160 = spv.IMul %159, %118 : i32
%161 = spv.IAdd %158, %160 : i32
%162 = spv.Constant 24 : i32
%163 = spv.IMul %162, %133 : i32
%164 = spv.IAdd %161, %163 : i32
%165 = spv.Constant 8 : i32
%166 = spv.IMul %165, %5 : i32
%167 = spv.IAdd %164, %166 : i32
%168 = spv.Constant 1 : i32
%169 = spv.IMul %168, %140 : i32
%170 = spv.IAdd %167, %169 : i32
%171 = spv.AccessChain %11[%157, %170] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%172 = spv.Load "StorageBuffer" %171 : vector<4xf32>
// Binding-1 load %188: same index with 8 * 2 (%2) as the third term.
%173 = spv.Constant 0 : i32
%174 = spv.Constant 0 : i32
%175 = spv.Constant 72 : i32
%176 = spv.IMul %175, %118 : i32
%177 = spv.IAdd %174, %176 : i32
%178 = spv.Constant 24 : i32
%179 = spv.IMul %178, %133 : i32
%180 = spv.IAdd %177, %179 : i32
%181 = spv.Constant 8 : i32
%182 = spv.IMul %181, %2 : i32
%183 = spv.IAdd %180, %182 : i32
%184 = spv.Constant 1 : i32
%185 = spv.IMul %184, %140 : i32
%186 = spv.IAdd %183, %185 : i32
%187 = spv.AccessChain %11[%173, %186] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%188 = spv.Load "StorageBuffer" %187 : vector<4xf32>
// Binding-0 coordinates: %191 = wg.z * 8 + local.z * 8 + %118, %192 = wg.y * 8 + local.y * 2 + %133.
%189 = spv.IAdd %38, %118 : i32
%190 = spv.IAdd %39, %133 : i32
%191 = spv.IAdd %25, %189 : i32
%192 = spv.IAdd %26, %190 : i32
// Group 0: three consecutive f32 loads at 675 * %191 + 3 * %192 + {0, 1, 2} -> %209, %226, %243.
%193 = spv.Constant 0 : i32
%194 = spv.Constant 0 : i32
%195 = spv.Constant 0 : i32
%196 = spv.Constant 151875 : i32
%197 = spv.IMul %196, %7 : i32
%198 = spv.IAdd %195, %197 : i32
%199 = spv.Constant 675 : i32
%200 = spv.IMul %199, %191 : i32
%201 = spv.IAdd %198, %200 : i32
%202 = spv.Constant 3 : i32
%203 = spv.IMul %202, %192 : i32
%204 = spv.IAdd %201, %203 : i32
%205 = spv.Constant 1 : i32
%206 = spv.IMul %205, %193 : i32
%207 = spv.IAdd %204, %206 : i32
%208 = spv.AccessChain %10[%194, %207] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%209 = spv.Load "StorageBuffer" %208 : f32
%210 = spv.Constant 1 : i32
%211 = spv.Constant 0 : i32
%212 = spv.Constant 0 : i32
%213 = spv.Constant 151875 : i32
%214 = spv.IMul %213, %7 : i32
%215 = spv.IAdd %212, %214 : i32
%216 = spv.Constant 675 : i32
%217 = spv.IMul %216, %191 : i32
%218 = spv.IAdd %215, %217 : i32
%219 = spv.Constant 3 : i32
%220 = spv.IMul %219, %192 : i32
%221 = spv.IAdd %218, %220 : i32
%222 = spv.Constant 1 : i32
%223 = spv.IMul %222, %210 : i32
%224 = spv.IAdd %221, %223 : i32
%225 = spv.AccessChain %10[%211, %224] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%226 = spv.Load "StorageBuffer" %225 : f32
%227 = spv.Constant 2 : i32
%228 = spv.Constant 0 : i32
%229 = spv.Constant 0 : i32
%230 = spv.Constant 151875 : i32
%231 = spv.IMul %230, %7 : i32
%232 = spv.IAdd %229, %231 : i32
%233 = spv.Constant 675 : i32
%234 = spv.IMul %233, %191 : i32
%235 = spv.IAdd %232, %234 : i32
%236 = spv.Constant 3 : i32
%237 = spv.IMul %236, %192 : i32
%238 = spv.IAdd %235, %237 : i32
%239 = spv.Constant 1 : i32
%240 = spv.IMul %239, %227 : i32
%241 = spv.IAdd %238, %240 : i32
%242 = spv.AccessChain %10[%228, %241] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%243 = spv.Load "StorageBuffer" %242 : f32
// Pack the three scalars; for each component i: splat to vec4, multiply by the i-th binding-1
// vector (%156 / %172 / %188) and add onto the running sum that starts from accumulator %134.
%244 = spv.CompositeConstruct %209, %226, %243 : vector<3xf32>
%245 = spv.CompositeExtract %244[0 : i32] : vector<3xf32>
%246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
%247 = spv.FMul %246, %156 : vector<4xf32>
%248 = spv.FAdd %247, %134 : vector<4xf32>
%249 = spv.CompositeExtract %244[1 : i32] : vector<3xf32>
%250 = spv.CompositeConstruct %249, %249, %249, %249 : vector<4xf32>
%251 = spv.FMul %250, %172 : vector<4xf32>
%252 = spv.FAdd %251, %248 : vector<4xf32>
%253 = spv.CompositeExtract %244[2 : i32] : vector<3xf32>
%254 = spv.CompositeConstruct %253, %253, %253, %253 : vector<4xf32>
%255 = spv.FMul %254, %188 : vector<4xf32>
%256 = spv.FAdd %255, %252 : vector<4xf32>
// Group 1: same as group 0 with (%118 + 2) in place of %118; accumulates onto %135 -> %323.
%257 = spv.IAdd %118, %2 : i32
%258 = spv.IAdd %38, %257 : i32
%259 = spv.IAdd %25, %258 : i32
%260 = spv.Constant 0 : i32
%261 = spv.Constant 0 : i32
%262 = spv.Constant 0 : i32
%263 = spv.Constant 151875 : i32
%264 = spv.IMul %263, %7 : i32
%265 = spv.IAdd %262, %264 : i32
%266 = spv.Constant 675 : i32
%267 = spv.IMul %266, %259 : i32
%268 = spv.IAdd %265, %267 : i32
%269 = spv.Constant 3 : i32
%270 = spv.IMul %269, %192 : i32
%271 = spv.IAdd %268, %270 : i32
%272 = spv.Constant 1 : i32
%273 = spv.IMul %272, %260 : i32
%274 = spv.IAdd %271, %273 : i32
%275 = spv.AccessChain %10[%261, %274] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%276 = spv.Load "StorageBuffer" %275 : f32
%277 = spv.Constant 1 : i32
%278 = spv.Constant 0 : i32
%279 = spv.Constant 0 : i32
%280 = spv.Constant 151875 : i32
%281 = spv.IMul %280, %7 : i32
%282 = spv.IAdd %279, %281 : i32
%283 = spv.Constant 675 : i32
%284 = spv.IMul %283, %259 : i32
%285 = spv.IAdd %282, %284 : i32
%286 = spv.Constant 3 : i32
%287 = spv.IMul %286, %192 : i32
%288 = spv.IAdd %285, %287 : i32
%289 = spv.Constant 1 : i32
%290 = spv.IMul %289, %277 : i32
%291 = spv.IAdd %288, %290 : i32
%292 = spv.AccessChain %10[%278, %291] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%293 = spv.Load "StorageBuffer" %292 : f32
%294 = spv.Constant 2 : i32
%295 = spv.Constant 0 : i32
%296 = spv.Constant 0 : i32
%297 = spv.Constant 151875 : i32
%298 = spv.IMul %297, %7 : i32
%299 = spv.IAdd %296, %298 : i32
%300 = spv.Constant 675 : i32
%301 = spv.IMul %300, %259 : i32
%302 = spv.IAdd %299, %301 : i32
%303 = spv.Constant 3 : i32
%304 = spv.IMul %303, %192 : i32
%305 = spv.IAdd %302, %304 : i32
%306 = spv.Constant 1 : i32
%307 = spv.IMul %306, %294 : i32
%308 = spv.IAdd %305, %307 : i32
%309 = spv.AccessChain %10[%295, %308] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%310 = spv.Load "StorageBuffer" %309 : f32
%311 = spv.CompositeConstruct %276, %293, %310 : vector<3xf32>
%312 = spv.CompositeExtract %311[0 : i32] : vector<3xf32>
%313 = spv.CompositeConstruct %312, %312, %312, %312 : vector<4xf32>
%314 = spv.FMul %313, %156 : vector<4xf32>
%315 = spv.FAdd %314, %135 : vector<4xf32>
%316 = spv.CompositeExtract %311[1 : i32] : vector<3xf32>
%317 = spv.CompositeConstruct %316, %316, %316, %316 : vector<4xf32>
%318 = spv.FMul %317, %172 : vector<4xf32>
%319 = spv.FAdd %318, %315 : vector<4xf32>
%320 = spv.CompositeExtract %311[2 : i32] : vector<3xf32>
%321 = spv.CompositeConstruct %320, %320, %320, %320 : vector<4xf32>
%322 = spv.FMul %321, %188 : vector<4xf32>
%323 = spv.FAdd %322, %319 : vector<4xf32>
// Group 2: same with (%118 + 4) in place of %118; accumulates onto %136 -> %390.
%324 = spv.IAdd %118, %1 : i32
%325 = spv.IAdd %38, %324 : i32
%326 = spv.IAdd %25, %325 : i32
%327 = spv.Constant 0 : i32
%328 = spv.Constant 0 : i32
%329 = spv.Constant 0 : i32
%330 = spv.Constant 151875 : i32
%331 = spv.IMul %330, %7 : i32
%332 = spv.IAdd %329, %331 : i32
%333 = spv.Constant 675 : i32
%334 = spv.IMul %333, %326 : i32
%335 = spv.IAdd %332, %334 : i32
%336 = spv.Constant 3 : i32
%337 = spv.IMul %336, %192 : i32
%338 = spv.IAdd %335, %337 : i32
%339 = spv.Constant 1 : i32
%340 = spv.IMul %339, %327 : i32
%341 = spv.IAdd %338, %340 : i32
%342 = spv.AccessChain %10[%328, %341] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%343 = spv.Load "StorageBuffer" %342 : f32
%344 = spv.Constant 1 : i32
%345 = spv.Constant 0 : i32
%346 = spv.Constant 0 : i32
%347 = spv.Constant 151875 : i32
%348 = spv.IMul %347, %7 : i32
%349 = spv.IAdd %346, %348 : i32
%350 = spv.Constant 675 : i32
%351 = spv.IMul %350, %326 : i32
%352 = spv.IAdd %349, %351 : i32
%353 = spv.Constant 3 : i32
%354 = spv.IMul %353, %192 : i32
%355 = spv.IAdd %352, %354 : i32
%356 = spv.Constant 1 : i32
%357 = spv.IMul %356, %344 : i32
%358 = spv.IAdd %355, %357 : i32
%359 = spv.AccessChain %10[%345, %358] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%360 = spv.Load "StorageBuffer" %359 : f32
%361 = spv.Constant 2 : i32
%362 = spv.Constant 0 : i32
%363 = spv.Constant 0 : i32
%364 = spv.Constant 151875 : i32
%365 = spv.IMul %364, %7 : i32
%366 = spv.IAdd %363, %365 : i32
%367 = spv.Constant 675 : i32
%368 = spv.IMul %367, %326 : i32
%369 = spv.IAdd %366, %368 : i32
%370 = spv.Constant 3 : i32
%371 = spv.IMul %370, %192 : i32
%372 = spv.IAdd %369, %371 : i32
%373 = spv.Constant 1 : i32
%374 = spv.IMul %373, %361 : i32
%375 = spv.IAdd %372, %374 : i32
%376 = spv.AccessChain %10[%362, %375] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%377 = spv.Load "StorageBuffer" %376 : f32
%378 = spv.CompositeConstruct %343, %360, %377 : vector<3xf32>
%379 = spv.CompositeExtract %378[0 : i32] : vector<3xf32>
%380 = spv.CompositeConstruct %379, %379, %379, %379 : vector<4xf32>
%381 = spv.FMul %380, %156 : vector<4xf32>
%382 = spv.FAdd %381, %136 : vector<4xf32>
%383 = spv.CompositeExtract %378[1 : i32] : vector<3xf32>
%384 = spv.CompositeConstruct %383, %383, %383, %383 : vector<4xf32>
%385 = spv.FMul %384, %172 : vector<4xf32>
%386 = spv.FAdd %385, %382 : vector<4xf32>
%387 = spv.CompositeExtract %378[2 : i32] : vector<3xf32>
%388 = spv.CompositeConstruct %387, %387, %387, %387 : vector<4xf32>
%389 = spv.FMul %388, %188 : vector<4xf32>
%390 = spv.FAdd %389, %386 : vector<4xf32>
// Group 3: same with (%118 + 6) in place of %118; accumulates onto %137 -> %457.
%391 = spv.IAdd %118, %3 : i32
%392 = spv.IAdd %38, %391 : i32
%393 = spv.IAdd %25, %392 : i32
%394 = spv.Constant 0 : i32
%395 = spv.Constant 0 : i32
%396 = spv.Constant 0 : i32
%397 = spv.Constant 151875 : i32
%398 = spv.IMul %397, %7 : i32
%399 = spv.IAdd %396, %398 : i32
%400 = spv.Constant 675 : i32
%401 = spv.IMul %400, %393 : i32
%402 = spv.IAdd %399, %401 : i32
%403 = spv.Constant 3 : i32
%404 = spv.IMul %403, %192 : i32
%405 = spv.IAdd %402, %404 : i32
%406 = spv.Constant 1 : i32
%407 = spv.IMul %406, %394 : i32
%408 = spv.IAdd %405, %407 : i32
%409 = spv.AccessChain %10[%395, %408] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%410 = spv.Load "StorageBuffer" %409 : f32
%411 = spv.Constant 1 : i32
%412 = spv.Constant 0 : i32
%413 = spv.Constant 0 : i32
%414 = spv.Constant 151875 : i32
%415 = spv.IMul %414, %7 : i32
%416 = spv.IAdd %413, %415 : i32
%417 = spv.Constant 675 : i32
%418 = spv.IMul %417, %393 : i32
%419 = spv.IAdd %416, %418 : i32
%420 = spv.Constant 3 : i32
%421 = spv.IMul %420, %192 : i32
%422 = spv.IAdd %419, %421 : i32
%423 = spv.Constant 1 : i32
%424 = spv.IMul %423, %411 : i32
%425 = spv.IAdd %422, %424 : i32
%426 = spv.AccessChain %10[%412, %425] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%427 = spv.Load "StorageBuffer" %426 : f32
%428 = spv.Constant 2 : i32
%429 = spv.Constant 0 : i32
%430 = spv.Constant 0 : i32
%431 = spv.Constant 151875 : i32
%432 = spv.IMul %431, %7 : i32
%433 = spv.IAdd %430, %432 : i32
%434 = spv.Constant 675 : i32
%435 = spv.IMul %434, %393 : i32
%436 = spv.IAdd %433, %435 : i32
%437 = spv.Constant 3 : i32
%438 = spv.IMul %437, %192 : i32
%439 = spv.IAdd %436, %438 : i32
%440 = spv.Constant 1 : i32
%441 = spv.IMul %440, %428 : i32
%442 = spv.IAdd %439, %441 : i32
%443 = spv.AccessChain %10[%429, %442] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%444 = spv.Load "StorageBuffer" %443 : f32
%445 = spv.CompositeConstruct %410, %427, %444 : vector<3xf32>
%446 = spv.CompositeExtract %445[0 : i32] : vector<3xf32>
%447 = spv.CompositeConstruct %446, %446, %446, %446 : vector<4xf32>
%448 = spv.FMul %447, %156 : vector<4xf32>
%449 = spv.FAdd %448, %137 : vector<4xf32>
%450 = spv.CompositeExtract %445[1 : i32] : vector<3xf32>
%451 = spv.CompositeConstruct %450, %450, %450, %450 : vector<4xf32>
%452 = spv.FMul %451, %172 : vector<4xf32>
%453 = spv.FAdd %452, %449 : vector<4xf32>
%454 = spv.CompositeExtract %445[2 : i32] : vector<3xf32>
%455 = spv.CompositeConstruct %454, %454, %454, %454 : vector<4xf32>
%456 = spv.FMul %455, %188 : vector<4xf32>
%457 = spv.FAdd %456, %453 : vector<4xf32>
// Publish the updated accumulators to the scratch variables, then take the back edge with %133 + 1.
spv.Store "Function" %124, %256 : vector<4xf32>
spv.Store "Function" %125, %323 : vector<4xf32>
spv.Store "Function" %126, %390 : vector<4xf32>
spv.Store "Function" %127, %457 : vector<4xf32>
%458 = spv.IAdd %133, %5 : i32
spv.Branch ^bb1(%458, %256, %323, %390, %457 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Read the inner loop's results back, copy them into the outer variables, and take the outer
// back edge with %118 + 1.
%128 = spv.Load "Function" %127 : vector<4xf32>
%129 = spv.Load "Function" %126 : vector<4xf32>
%130 = spv.Load "Function" %125 : vector<4xf32>
%131 = spv.Load "Function" %124 : vector<4xf32>
spv.Store "Function" %40, %131 : vector<4xf32>
spv.Store "Function" %41, %130 : vector<4xf32>
spv.Store "Function" %42, %129 : vector<4xf32>
spv.Store "Function" %43, %128 : vector<4xf32>
%132 = spv.IAdd %118, %5 : i32
spv.Branch ^bb1(%132, %131, %130, %129, %128 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulators: %47 = group 0, %46 = group 1, %45 = group 2, %44 = group 3.
%44 = spv.Load "Function" %43 : vector<4xf32>
%45 = spv.Load "Function" %42 : vector<4xf32>
%46 = spv.Load "Function" %41 : vector<4xf32>
%47 = spv.Load "Function" %40 : vector<4xf32>
// Binding-2 element index = 896 * r + 8 * %50 + %52, where %50 = wg.y * 4 + local.y,
// %52 = (wg.x * 16 + local.x * 4) / 4 and r = wg.z * 4 + local.z * 4 + k.
// Store group 3 at k = 3.
%48 = spv.IAdd %36, %6 : i32
%49 = spv.IAdd %22, %48 : i32
%50 = spv.IAdd %23, %32 : i32
%51 = spv.IAdd %24, %37 : i32
%52 = spv.SDiv %51, %1 : i32
%53 = spv.Constant 0 : i32
%54 = spv.Constant 0 : i32
%55 = spv.Constant 100352 : i32
%56 = spv.IMul %55, %7 : i32
%57 = spv.IAdd %54, %56 : i32
%58 = spv.Constant 896 : i32
%59 = spv.IMul %58, %49 : i32
%60 = spv.IAdd %57, %59 : i32
%61 = spv.Constant 8 : i32
%62 = spv.IMul %61, %50 : i32
%63 = spv.IAdd %60, %62 : i32
%64 = spv.Constant 1 : i32
%65 = spv.IMul %64, %52 : i32
%66 = spv.IAdd %63, %65 : i32
%67 = spv.AccessChain %12[%53, %66] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %67, %44 : vector<4xf32>
// Store group 2 at k = 2.
%68 = spv.IAdd %36, %2 : i32
%69 = spv.IAdd %22, %68 : i32
%70 = spv.Constant 0 : i32
%71 = spv.Constant 0 : i32
%72 = spv.Constant 100352 : i32
%73 = spv.IMul %72, %7 : i32
%74 = spv.IAdd %71, %73 : i32
%75 = spv.Constant 896 : i32
%76 = spv.IMul %75, %69 : i32
%77 = spv.IAdd %74, %76 : i32
%78 = spv.Constant 8 : i32
%79 = spv.IMul %78, %50 : i32
%80 = spv.IAdd %77, %79 : i32
%81 = spv.Constant 1 : i32
%82 = spv.IMul %81, %52 : i32
%83 = spv.IAdd %80, %82 : i32
%84 = spv.AccessChain %12[%70, %83] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %84, %45 : vector<4xf32>
// Store group 1 at k = 1.
%85 = spv.IAdd %36, %5 : i32
%86 = spv.IAdd %22, %85 : i32
%87 = spv.Constant 0 : i32
%88 = spv.Constant 0 : i32
%89 = spv.Constant 100352 : i32
%90 = spv.IMul %89, %7 : i32
%91 = spv.IAdd %88, %90 : i32
%92 = spv.Constant 896 : i32
%93 = spv.IMul %92, %86 : i32
%94 = spv.IAdd %91, %93 : i32
%95 = spv.Constant 8 : i32
%96 = spv.IMul %95, %50 : i32
%97 = spv.IAdd %94, %96 : i32
%98 = spv.Constant 1 : i32
%99 = spv.IMul %98, %52 : i32
%100 = spv.IAdd %97, %99 : i32
%101 = spv.AccessChain %12[%87, %100] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %101, %46 : vector<4xf32>
// Store group 0 at k = 0.
%102 = spv.IAdd %22, %36 : i32
%103 = spv.Constant 0 : i32
%104 = spv.Constant 0 : i32
%105 = spv.Constant 100352 : i32
%106 = spv.IMul %105, %7 : i32
%107 = spv.IAdd %104, %106 : i32
%108 = spv.Constant 896 : i32
%109 = spv.IMul %108, %102 : i32
%110 = spv.IAdd %107, %109 : i32
%111 = spv.Constant 8 : i32
%112 = spv.IMul %111, %50 : i32
%113 = spv.IAdd %110, %112 : i32
%114 = spv.Constant 1 : i32
%115 = spv.IMul %114, %52 : i32
%116 = spv.IAdd %113, %115 : i32
%117 = spv.AccessChain %12[%103, %116] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %117, %47 : vector<4xf32>
spv.Return
}
// Compute-shader entry point with its built-in interface variables; local workgroup size is 4 x 4 x 1.
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// *** IR Dump After Canonicalizer ***
// Same SPIR-V module as printed after the Canonicalizer pass: index arithmetic now uses the
// shared constants defined at the top of the function.
spv.module Logical GLSL450 {
// Built-in inputs read by the entry function.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Storage buffers in descriptor set 0. In the function below, binding 2 is only stored to and
// bindings 1 and 0 are only loaded from.
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
// Entry function: each invocation accumulates four vec4 results over a 3x3 loop nest and
// writes them to four slots of the binding-2 buffer.
spv.func @conv_dispatch_0() "None" {
// Shared constants: %0 zero accumulator; %9 / %10 multiply the loop counters in the binding-1
// index; %11 multiplies the first binding-0 coordinate; %12 multiplies the binding-2 row.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
// %13 = binding 0, %14 = binding 1, %15 = binding 2.
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// WorkgroupId components: %18 = x, %21 = y, %24 = z (the address-of/load pair is still repeated per component here).
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%20 = spv.Load "Input" %19 : vector<3xi32>
%21 = spv.CompositeExtract %20[1 : i32] : vector<3xi32>
%22 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%23 = spv.Load "Input" %22 : vector<3xi32>
%24 = spv.CompositeExtract %23[2 : i32] : vector<3xi32>
// Per-workgroup base offsets: %25 = wg.z * 4, %26 = wg.y * 4, %27 = wg.x * 16, %28 = wg.z * 8, %29 = wg.y * 8.
%25 = spv.IMul %24, %1 : i32
%26 = spv.IMul %21, %1 : i32
%27 = spv.IMul %18, %7 : i32
%28 = spv.IMul %24, %8 : i32
%29 = spv.IMul %21, %8 : i32
// LocalInvocationId components: %32 = x, %35 = y, %38 = z.
%30 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%31 = spv.Load "Input" %30 : vector<3xi32>
%32 = spv.CompositeExtract %31[0 : i32] : vector<3xi32>
%33 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%34 = spv.Load "Input" %33 : vector<3xi32>
%35 = spv.CompositeExtract %34[1 : i32] : vector<3xi32>
%36 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%37 = spv.Load "Input" %36 : vector<3xi32>
%38 = spv.CompositeExtract %37[2 : i32] : vector<3xi32>
// Per-invocation offsets: %39 = local.z * 4, %40 = local.x * 4, %41 = local.z * 8, %42 = local.y * 2.
%39 = spv.IMul %38, %1 : i32
%40 = spv.IMul %32, %1 : i32
%41 = spv.IMul %38, %8 : i32
%42 = spv.IMul %35, %2 : i32
// Function-storage variables that carry the four accumulators out of the outer loop region.
%43 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%44 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%45 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%46 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: %81 runs 0, 1, 2 (bound %5 = 3, step %4 = 1); %82..%85 are the accumulators, starting at zero.
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%81: i32, %82: vector<4xf32>, %83: vector<4xf32>, %84: vector<4xf32>, %85: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%86 = spv.SLessThan %81, %5 : i32
spv.BranchConditional %86, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Scratch variables that carry the inner loop's accumulators out of its region.
%87 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%88 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%89 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%90 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: %96 runs 0, 1, 2, continuing from the outer loop's current accumulator values.
spv.mlir.loop {
spv.Branch ^bb1(%6, %82, %83, %84, %85 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%96: i32, %97: vector<4xf32>, %98: vector<4xf32>, %99: vector<4xf32>, %100: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%101 = spv.SLessThan %96, %5 : i32
spv.BranchConditional %101, ^bb2, ^bb3
^bb2: // pred: ^bb1
// %103 = (wg.x * 16 + local.x * 4) / 4: the vec4 slot shared by all binding-1 loads below.
%102 = spv.IAdd %27, %40 : i32
%103 = spv.SDiv %102, %1 : i32
// Binding-1 loads at 72 * %81 + 24 * %96 + {0, 8, 16} + %103 -> %109, %116, %123.
// The 72 * %81 and 24 * %96 products are recomputed for each load in this dump.
%104 = spv.IMul %81, %9 : i32
%105 = spv.IMul %96, %10 : i32
%106 = spv.IAdd %104, %105 : i32
%107 = spv.IAdd %106, %103 : i32
%108 = spv.AccessChain %14[%6, %107] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%109 = spv.Load "StorageBuffer" %108 : vector<4xf32>
%110 = spv.IMul %81, %9 : i32
%111 = spv.IMul %96, %10 : i32
%112 = spv.IAdd %110, %111 : i32
%113 = spv.IAdd %112, %8 : i32
%114 = spv.IAdd %113, %103 : i32
%115 = spv.AccessChain %14[%6, %114] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%116 = spv.Load "StorageBuffer" %115 : vector<4xf32>
%117 = spv.IMul %81, %9 : i32
%118 = spv.IMul %96, %10 : i32
%119 = spv.IAdd %117, %118 : i32
%120 = spv.IAdd %119, %7 : i32
%121 = spv.IAdd %120, %103 : i32
%122 = spv.AccessChain %14[%6, %121] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%123 = spv.Load "StorageBuffer" %122 : vector<4xf32>
// Binding-0 coordinates: %126 = wg.z * 8 + local.z * 8 + %81, %127 = wg.y * 8 + local.y * 2 + %96.
%124 = spv.IAdd %41, %81 : i32
%125 = spv.IAdd %42, %96 : i32
%126 = spv.IAdd %28, %124 : i32
%127 = spv.IAdd %29, %125 : i32
// Group 0: three consecutive f32 loads at 675 * %126 + 3 * %127 + {0, 1, 2} -> %132, %138, %144.
%128 = spv.IMul %126, %11 : i32
%129 = spv.IMul %127, %5 : i32
%130 = spv.IAdd %128, %129 : i32
%131 = spv.AccessChain %13[%6, %130] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%132 = spv.Load "StorageBuffer" %131 : f32
%133 = spv.IMul %126, %11 : i32
%134 = spv.IMul %127, %5 : i32
%135 = spv.IAdd %133, %134 : i32
%136 = spv.IAdd %135, %4 : i32
%137 = spv.AccessChain %13[%6, %136] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%138 = spv.Load "StorageBuffer" %137 : f32
%139 = spv.IMul %126, %11 : i32
%140 = spv.IMul %127, %5 : i32
%141 = spv.IAdd %139, %140 : i32
%142 = spv.IAdd %141, %2 : i32
%143 = spv.AccessChain %13[%6, %142] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%144 = spv.Load "StorageBuffer" %143 : f32
// Pack the three scalars; for each component i: splat to vec4, multiply by the i-th binding-1
// vector (%109 / %116 / %123) and add onto the running sum that starts from accumulator %97.
%145 = spv.CompositeConstruct %132, %138, %144 : vector<3xf32>
%146 = spv.CompositeExtract %145[0 : i32] : vector<3xf32>
%147 = spv.CompositeConstruct %146, %146, %146, %146 : vector<4xf32>
%148 = spv.FMul %147, %109 : vector<4xf32>
%149 = spv.FAdd %148, %97 : vector<4xf32>
%150 = spv.CompositeExtract %145[1 : i32] : vector<3xf32>
%151 = spv.CompositeConstruct %150, %150, %150, %150 : vector<4xf32>
%152 = spv.FMul %151, %116 : vector<4xf32>
%153 = spv.FAdd %152, %149 : vector<4xf32>
%154 = spv.CompositeExtract %145[2 : i32] : vector<3xf32>
%155 = spv.CompositeConstruct %154, %154, %154, %154 : vector<4xf32>
%156 = spv.FMul %155, %123 : vector<4xf32>
%157 = spv.FAdd %156, %153 : vector<4xf32>
// Group 1: same as group 0 with (%81 + 2) in place of %81; accumulates onto %98 -> %190.
%158 = spv.IAdd %81, %2 : i32
%159 = spv.IAdd %41, %158 : i32
%160 = spv.IAdd %28, %159 : i32
%161 = spv.IMul %160, %11 : i32
%162 = spv.IMul %127, %5 : i32
%163 = spv.IAdd %161, %162 : i32
%164 = spv.AccessChain %13[%6, %163] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%165 = spv.Load "StorageBuffer" %164 : f32
%166 = spv.IMul %160, %11 : i32
%167 = spv.IMul %127, %5 : i32
%168 = spv.IAdd %166, %167 : i32
%169 = spv.IAdd %168, %4 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IMul %160, %11 : i32
%173 = spv.IMul %127, %5 : i32
%174 = spv.IAdd %172, %173 : i32
%175 = spv.IAdd %174, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %165, %171, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %109 : vector<4xf32>
%182 = spv.FAdd %181, %98 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %116 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %123 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
// Group 2: same with (%81 + 4) in place of %81; accumulates onto %99 -> %223.
%191 = spv.IAdd %81, %1 : i32
%192 = spv.IAdd %41, %191 : i32
%193 = spv.IAdd %28, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IMul %127, %5 : i32
%196 = spv.IAdd %194, %195 : i32
%197 = spv.AccessChain %13[%6, %196] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%198 = spv.Load "StorageBuffer" %197 : f32
%199 = spv.IMul %193, %11 : i32
%200 = spv.IMul %127, %5 : i32
%201 = spv.IAdd %199, %200 : i32
%202 = spv.IAdd %201, %4 : i32
%203 = spv.AccessChain %13[%6, %202] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%204 = spv.Load "StorageBuffer" %203 : f32
%205 = spv.IMul %193, %11 : i32
%206 = spv.IMul %127, %5 : i32
%207 = spv.IAdd %205, %206 : i32
%208 = spv.IAdd %207, %2 : i32
%209 = spv.AccessChain %13[%6, %208] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%210 = spv.Load "StorageBuffer" %209 : f32
%211 = spv.CompositeConstruct %198, %204, %210 : vector<3xf32>
%212 = spv.CompositeExtract %211[0 : i32] : vector<3xf32>
%213 = spv.CompositeConstruct %212, %212, %212, %212 : vector<4xf32>
%214 = spv.FMul %213, %109 : vector<4xf32>
%215 = spv.FAdd %214, %99 : vector<4xf32>
%216 = spv.CompositeExtract %211[1 : i32] : vector<3xf32>
%217 = spv.CompositeConstruct %216, %216, %216, %216 : vector<4xf32>
%218 = spv.FMul %217, %116 : vector<4xf32>
%219 = spv.FAdd %218, %215 : vector<4xf32>
%220 = spv.CompositeExtract %211[2 : i32] : vector<3xf32>
%221 = spv.CompositeConstruct %220, %220, %220, %220 : vector<4xf32>
%222 = spv.FMul %221, %123 : vector<4xf32>
%223 = spv.FAdd %222, %219 : vector<4xf32>
// Group 3: same with (%81 + 6) in place of %81; accumulates onto %100 -> %256.
%224 = spv.IAdd %81, %3 : i32
%225 = spv.IAdd %41, %224 : i32
%226 = spv.IAdd %28, %225 : i32
%227 = spv.IMul %226, %11 : i32
%228 = spv.IMul %127, %5 : i32
%229 = spv.IAdd %227, %228 : i32
%230 = spv.AccessChain %13[%6, %229] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%231 = spv.Load "StorageBuffer" %230 : f32
%232 = spv.IMul %226, %11 : i32
%233 = spv.IMul %127, %5 : i32
%234 = spv.IAdd %232, %233 : i32
%235 = spv.IAdd %234, %4 : i32
%236 = spv.AccessChain %13[%6, %235] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%237 = spv.Load "StorageBuffer" %236 : f32
%238 = spv.IMul %226, %11 : i32
%239 = spv.IMul %127, %5 : i32
%240 = spv.IAdd %238, %239 : i32
%241 = spv.IAdd %240, %2 : i32
%242 = spv.AccessChain %13[%6, %241] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%243 = spv.Load "StorageBuffer" %242 : f32
%244 = spv.CompositeConstruct %231, %237, %243 : vector<3xf32>
%245 = spv.CompositeExtract %244[0 : i32] : vector<3xf32>
%246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
%247 = spv.FMul %246, %109 : vector<4xf32>
%248 = spv.FAdd %247, %100 : vector<4xf32>
%249 = spv.CompositeExtract %244[1 : i32] : vector<3xf32>
%250 = spv.CompositeConstruct %249, %249, %249, %249 : vector<4xf32>
%251 = spv.FMul %250, %116 : vector<4xf32>
%252 = spv.FAdd %251, %248 : vector<4xf32>
%253 = spv.CompositeExtract %244[2 : i32] : vector<3xf32>
%254 = spv.CompositeConstruct %253, %253, %253, %253 : vector<4xf32>
%255 = spv.FMul %254, %123 : vector<4xf32>
%256 = spv.FAdd %255, %252 : vector<4xf32>
// Publish the updated accumulators to the scratch variables, then take the back edge with %96 + 1.
spv.Store "Function" %87, %157 : vector<4xf32>
spv.Store "Function" %88, %190 : vector<4xf32>
spv.Store "Function" %89, %223 : vector<4xf32>
spv.Store "Function" %90, %256 : vector<4xf32>
%257 = spv.IAdd %96, %4 : i32
spv.Branch ^bb1(%257, %157, %190, %223, %256 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Read the inner loop's results back, copy them into the outer variables, and take the outer
// back edge with %81 + 1.
%91 = spv.Load "Function" %90 : vector<4xf32>
%92 = spv.Load "Function" %89 : vector<4xf32>
%93 = spv.Load "Function" %88 : vector<4xf32>
%94 = spv.Load "Function" %87 : vector<4xf32>
spv.Store "Function" %43, %94 : vector<4xf32>
spv.Store "Function" %44, %93 : vector<4xf32>
spv.Store "Function" %45, %92 : vector<4xf32>
spv.Store "Function" %46, %91 : vector<4xf32>
%95 = spv.IAdd %81, %4 : i32
spv.Branch ^bb1(%95, %94, %93, %92, %91 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulators: %50 = group 0, %49 = group 1, %48 = group 2, %47 = group 3.
%47 = spv.Load "Function" %46 : vector<4xf32>
%48 = spv.Load "Function" %45 : vector<4xf32>
%49 = spv.Load "Function" %44 : vector<4xf32>
%50 = spv.Load "Function" %43 : vector<4xf32>
// Binding-2 element index = 896 * r + 8 * %53 + %55, where %53 = wg.y * 4 + local.y,
// %55 = (wg.x * 16 + local.x * 4) / 4 and r = wg.z * 4 + local.z * 4 + k.
// Store group 3 at k = 3.
%51 = spv.IAdd %39, %5 : i32
%52 = spv.IAdd %25, %51 : i32
%53 = spv.IAdd %26, %35 : i32
%54 = spv.IAdd %27, %40 : i32
%55 = spv.SDiv %54, %1 : i32
%56 = spv.IMul %52, %12 : i32
%57 = spv.IMul %53, %8 : i32
%58 = spv.IAdd %56, %57 : i32
%59 = spv.IAdd %58, %55 : i32
%60 = spv.AccessChain %15[%6, %59] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %60, %47 : vector<4xf32>
// Store group 2 at k = 2.
%61 = spv.IAdd %39, %2 : i32
%62 = spv.IAdd %25, %61 : i32
%63 = spv.IMul %62, %12 : i32
%64 = spv.IMul %53, %8 : i32
%65 = spv.IAdd %63, %64 : i32
%66 = spv.IAdd %65, %55 : i32
%67 = spv.AccessChain %15[%6, %66] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %67, %48 : vector<4xf32>
// Store group 1 at k = 1.
%68 = spv.IAdd %39, %4 : i32
%69 = spv.IAdd %25, %68 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IMul %53, %8 : i32
%72 = spv.IAdd %70, %71 : i32
%73 = spv.IAdd %72, %55 : i32
%74 = spv.AccessChain %15[%6, %73] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %74, %49 : vector<4xf32>
// Store group 0 at k = 0.
%75 = spv.IAdd %25, %39 : i32
%76 = spv.IMul %75, %12 : i32
%77 = spv.IMul %53, %8 : i32
%78 = spv.IAdd %76, %77 : i32
%79 = spv.IAdd %78, %55 : i32
%80 = spv.AccessChain %15[%6, %79] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %80, %50 : vector<4xf32>
spv.Return
}
// Compute-shader entry point with its built-in interface variables; local workgroup size is 4 x 4 x 1.
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// *** IR Dump After CSE ***
// SPIR-V module (Logical addressing, GLSL450 memory model) containing the single
// compute entry point @conv_dispatch_0, as printed after the CSE pass.
spv.module Logical GLSL450 {
// Built-in inputs read by the kernel.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Three storage buffers in descriptor set 0:
//   binding 2: 100352 x vec4 f32, only stored to in this function
//   binding 1: 216 x vec4 f32, only loaded from in this function
//   binding 0: 151875 x f32, only loaded from in this function
// NOTE(review): the element counts match 112*112*32/4, 3*3*3*32/4 and 225*225*3,
// i.e. presumably the conv output, filter and input tensors - confirm against the dispatch.
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
// Constants: %0 is the zero vec4 used to seed the accumulators; the i32 values are
// reused below as loop bounds, loop steps, ID scale factors and index multipliers.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
// Pointers to the three storage buffers: %13 -> binding 0, %14 -> binding 1, %15 -> binding 2.
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// WorkgroupId components: %18 = x, %20 = y, %22 = z.
// The built-in is loaded once per extracted component; the three loads are still
// separate operations in this post-CSE dump.
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.Load "Input" %16 : vector<3xi32>
%20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
%21 = spv.Load "Input" %16 : vector<3xi32>
%22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
// Scaled workgroup offsets:
//   %23 = wg.z * 4, %24 = wg.y * 4, %25 = wg.x * 16   (used for binding 2 and binding 1 indices)
//   %26 = wg.z * 8, %27 = wg.y * 8                    (used for binding 0 indices)
%23 = spv.IMul %22, %1 : i32
%24 = spv.IMul %20, %1 : i32
%25 = spv.IMul %18, %7 : i32
%26 = spv.IMul %22, %8 : i32
%27 = spv.IMul %20, %8 : i32
// LocalInvocationId components: %30 = x, %32 = y, %34 = z.
%28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%29 = spv.Load "Input" %28 : vector<3xi32>
%30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
%31 = spv.Load "Input" %28 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.Load "Input" %28 : vector<3xi32>
%34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
// Scaled invocation offsets: %35 = lid.z * 4, %36 = lid.x * 4, %37 = lid.z * 8, %38 = lid.y * 2.
%35 = spv.IMul %34, %1 : i32
%36 = spv.IMul %30, %1 : i32
%37 = spv.IMul %34, %8 : i32
%38 = spv.IMul %32, %2 : i32
// Four function-scope vec4 slots that carry the accumulators out of the outer loop.
%39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: %74 runs from 0 while %74 < 3 in steps of 1, carrying four vec4
// accumulators (%75..%78) that start at zero.
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%79 = spv.SLessThan %74, %5 : i32
spv.BranchConditional %79, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Function-scope slots that carry the accumulators out of the inner loop.
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: %89 runs from 0 while %89 < 3 in steps of 1, continuing the same
// four accumulators (%90..%93) from the outer iteration.
spv.mlir.loop {
spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%94 = spv.SLessThan %89, %5 : i32
spv.BranchConditional %94, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Load three vec4 values from binding 1 at element indices
//   %74 * 72 + %89 * 24 + {0, 8, 16} + (wg.x * 16 + lid.x * 4) / 4
// giving %102, %106 and %110.
%95 = spv.IAdd %25, %36 : i32
%96 = spv.SDiv %95, %1 : i32
%97 = spv.IMul %74, %9 : i32
%98 = spv.IMul %89, %10 : i32
%99 = spv.IAdd %97, %98 : i32
%100 = spv.IAdd %99, %96 : i32
%101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
%103 = spv.IAdd %99, %8 : i32
%104 = spv.IAdd %103, %96 : i32
%105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
%107 = spv.IAdd %99, %7 : i32
%108 = spv.IAdd %107, %96 : i32
%109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
// Group 0 (accumulator %90): load three consecutive f32 values from binding 0 starting at
//   %117 = (wg.z * 8 + lid.z * 8 + %74) * 675 + (wg.y * 8 + lid.y * 2 + %89) * 3
// %116 (the second term) is reused by the three groups that follow.
// NOTE(review): 675 = 225 * 3, presumably the row stride of a 225x225x3 image - confirm.
%111 = spv.IAdd %37, %74 : i32
%112 = spv.IAdd %38, %89 : i32
%113 = spv.IAdd %26, %111 : i32
%114 = spv.IAdd %27, %112 : i32
%115 = spv.IMul %113, %11 : i32
%116 = spv.IMul %114, %5 : i32
%117 = spv.IAdd %115, %116 : i32
%118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : f32
%120 = spv.IAdd %117, %4 : i32
%121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%122 = spv.Load "StorageBuffer" %121 : f32
%123 = spv.IAdd %117, %2 : i32
%124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
// Pack the three scalars, then broadcast each one to a vec4 and multiply-accumulate:
//   %138 = s2 * %110 + (s1 * %106 + (s0 * %102 + %90))
%126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
%127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
%128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
%129 = spv.FMul %128, %102 : vector<4xf32>
%130 = spv.FAdd %129, %90 : vector<4xf32>
%131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.FMul %132, %106 : vector<4xf32>
%134 = spv.FAdd %133, %130 : vector<4xf32>
%135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
%136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
%137 = spv.FMul %136, %110 : vector<4xf32>
%138 = spv.FAdd %137, %134 : vector<4xf32>
// Group 1 (accumulator %91): same pattern with %74 + 2 in the term multiplied by 675.
%139 = spv.IAdd %74, %2 : i32
%140 = spv.IAdd %37, %139 : i32
%141 = spv.IAdd %26, %140 : i32
%142 = spv.IMul %141, %11 : i32
%143 = spv.IAdd %142, %116 : i32
%144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%145 = spv.Load "StorageBuffer" %144 : f32
%146 = spv.IAdd %143, %4 : i32
%147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%148 = spv.Load "StorageBuffer" %147 : f32
%149 = spv.IAdd %143, %2 : i32
%150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%151 = spv.Load "StorageBuffer" %150 : f32
%152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
%153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
%154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
%155 = spv.FMul %154, %102 : vector<4xf32>
%156 = spv.FAdd %155, %91 : vector<4xf32>
%157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
%158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
%159 = spv.FMul %158, %106 : vector<4xf32>
%160 = spv.FAdd %159, %156 : vector<4xf32>
%161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
%162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
%163 = spv.FMul %162, %110 : vector<4xf32>
%164 = spv.FAdd %163, %160 : vector<4xf32>
// Group 2 (accumulator %92): same pattern with %74 + 4.
%165 = spv.IAdd %74, %1 : i32
%166 = spv.IAdd %37, %165 : i32
%167 = spv.IAdd %26, %166 : i32
%168 = spv.IMul %167, %11 : i32
%169 = spv.IAdd %168, %116 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IAdd %169, %4 : i32
%173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%174 = spv.Load "StorageBuffer" %173 : f32
%175 = spv.IAdd %169, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %102 : vector<4xf32>
%182 = spv.FAdd %181, %92 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %106 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %110 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
// Group 3 (accumulator %93): same pattern with %74 + 6.
%191 = spv.IAdd %74, %3 : i32
%192 = spv.IAdd %37, %191 : i32
%193 = spv.IAdd %26, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IAdd %194, %116 : i32
%196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%197 = spv.Load "StorageBuffer" %196 : f32
%198 = spv.IAdd %195, %4 : i32
%199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%200 = spv.Load "StorageBuffer" %199 : f32
%201 = spv.IAdd %195, %2 : i32
%202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
%205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
%206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
%207 = spv.FMul %206, %102 : vector<4xf32>
%208 = spv.FAdd %207, %93 : vector<4xf32>
%209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
%210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
%211 = spv.FMul %210, %106 : vector<4xf32>
%212 = spv.FAdd %211, %208 : vector<4xf32>
%213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.FMul %214, %110 : vector<4xf32>
%216 = spv.FAdd %215, %212 : vector<4xf32>
// Spill the updated accumulators so they can be read after the loop's merge block,
// then step %89 by 1 and pass the same values back as block arguments.
spv.Store "Function" %80, %138 : vector<4xf32>
spv.Store "Function" %81, %164 : vector<4xf32>
spv.Store "Function" %82, %190 : vector<4xf32>
spv.Store "Function" %83, %216 : vector<4xf32>
%217 = spv.IAdd %89, %4 : i32
spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Reload the inner-loop results and copy them into the outer-loop slots
// (%80 -> %39, %81 -> %40, %82 -> %41, %83 -> %42), then step %74 by 1.
%84 = spv.Load "Function" %83 : vector<4xf32>
%85 = spv.Load "Function" %82 : vector<4xf32>
%86 = spv.Load "Function" %81 : vector<4xf32>
%87 = spv.Load "Function" %80 : vector<4xf32>
spv.Store "Function" %39, %87 : vector<4xf32>
spv.Store "Function" %40, %86 : vector<4xf32>
spv.Store "Function" %41, %85 : vector<4xf32>
spv.Store "Function" %42, %84 : vector<4xf32>
%88 = spv.IAdd %74, %4 : i32
spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulators: %46 = group 0, %45 = group 1, %44 = group 2, %43 = group 3.
%43 = spv.Load "Function" %42 : vector<4xf32>
%44 = spv.Load "Function" %41 : vector<4xf32>
%45 = spv.Load "Function" %40 : vector<4xf32>
%46 = spv.Load "Function" %39 : vector<4xf32>
// Write the four results to binding 2 at element index
//   (wg.z * 4 + lid.z * 4 + k) * 896 + (wg.y * 4 + lid.y) * 8 + (wg.x * 16 + lid.x * 4) / 4
// with k = 3, 2, 1, 0 for %43, %44, %45, %46 respectively.
// NOTE(review): 896 = 112 * 8, presumably the vec4 row stride of a 112x112x32 output - confirm.
%47 = spv.IAdd %35, %5 : i32
%48 = spv.IAdd %23, %47 : i32
%49 = spv.IAdd %24, %32 : i32
%50 = spv.IAdd %25, %36 : i32
%51 = spv.SDiv %50, %1 : i32
%52 = spv.IMul %48, %12 : i32
%53 = spv.IMul %49, %8 : i32
%54 = spv.IAdd %52, %53 : i32
%55 = spv.IAdd %54, %51 : i32
%56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
%57 = spv.IAdd %35, %2 : i32
%58 = spv.IAdd %23, %57 : i32
%59 = spv.IMul %58, %12 : i32
%60 = spv.IAdd %59, %53 : i32
%61 = spv.IAdd %60, %51 : i32
%62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
%63 = spv.IAdd %35, %4 : i32
%64 = spv.IAdd %23, %63 : i32
%65 = spv.IMul %64, %12 : i32
%66 = spv.IAdd %65, %53 : i32
%67 = spv.IAdd %66, %51 : i32
%68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
%69 = spv.IAdd %23, %35 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IAdd %70, %53 : i32
%72 = spv.IAdd %71, %51 : i32
%73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
spv.Return
}
// Entry point declaration: GLCompute execution model, interface lists the two built-ins,
// and the local workgroup size is fixed at 4 x 4 x 1 invocations.
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// *** IR Dump After SPIRVUpdateVCE ***
// SPIR-V module (Logical addressing, GLSL450 memory model) containing the single
// compute entry point @conv_dispatch_0, as printed after the SPIRVUpdateVCE pass.
// The header carries a `requires` triple: version v1.0, capability Shader,
// extension SPV_KHR_storage_buffer_storage_class.
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
// Built-in inputs read by the kernel.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Three storage buffers in descriptor set 0:
//   binding 2: 100352 x vec4 f32, only stored to in this function
//   binding 1: 216 x vec4 f32, only loaded from in this function
//   binding 0: 151875 x f32, only loaded from in this function
// NOTE(review): the element counts match 112*112*32/4, 3*3*3*32/4 and 225*225*3,
// i.e. presumably the conv output, filter and input tensors - confirm against the dispatch.
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
// Constants: %0 is the zero vec4 used to seed the accumulators; the i32 values are
// reused below as loop bounds, loop steps, ID scale factors and index multipliers.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
// Pointers to the three storage buffers: %13 -> binding 0, %14 -> binding 1, %15 -> binding 2.
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// WorkgroupId components: %18 = x, %20 = y, %22 = z.
// The built-in is loaded once per extracted component; the three loads are
// separate operations in this dump.
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.Load "Input" %16 : vector<3xi32>
%20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
%21 = spv.Load "Input" %16 : vector<3xi32>
%22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
// Scaled workgroup offsets:
//   %23 = wg.z * 4, %24 = wg.y * 4, %25 = wg.x * 16   (used for binding 2 and binding 1 indices)
//   %26 = wg.z * 8, %27 = wg.y * 8                    (used for binding 0 indices)
%23 = spv.IMul %22, %1 : i32
%24 = spv.IMul %20, %1 : i32
%25 = spv.IMul %18, %7 : i32
%26 = spv.IMul %22, %8 : i32
%27 = spv.IMul %20, %8 : i32
// LocalInvocationId components: %30 = x, %32 = y, %34 = z.
%28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%29 = spv.Load "Input" %28 : vector<3xi32>
%30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
%31 = spv.Load "Input" %28 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.Load "Input" %28 : vector<3xi32>
%34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
// Scaled invocation offsets: %35 = lid.z * 4, %36 = lid.x * 4, %37 = lid.z * 8, %38 = lid.y * 2.
%35 = spv.IMul %34, %1 : i32
%36 = spv.IMul %30, %1 : i32
%37 = spv.IMul %34, %8 : i32
%38 = spv.IMul %32, %2 : i32
// Four function-scope vec4 slots that carry the accumulators out of the outer loop.
%39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: %74 runs from 0 while %74 < 3 in steps of 1, carrying four vec4
// accumulators (%75..%78) that start at zero.
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%79 = spv.SLessThan %74, %5 : i32
spv.BranchConditional %79, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Function-scope slots that carry the accumulators out of the inner loop.
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: %89 runs from 0 while %89 < 3 in steps of 1, continuing the same
// four accumulators (%90..%93) from the outer iteration.
spv.mlir.loop {
spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%94 = spv.SLessThan %89, %5 : i32
spv.BranchConditional %94, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Load three vec4 values from binding 1 at element indices
//   %74 * 72 + %89 * 24 + {0, 8, 16} + (wg.x * 16 + lid.x * 4) / 4
// giving %102, %106 and %110.
%95 = spv.IAdd %25, %36 : i32
%96 = spv.SDiv %95, %1 : i32
%97 = spv.IMul %74, %9 : i32
%98 = spv.IMul %89, %10 : i32
%99 = spv.IAdd %97, %98 : i32
%100 = spv.IAdd %99, %96 : i32
%101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
%103 = spv.IAdd %99, %8 : i32
%104 = spv.IAdd %103, %96 : i32
%105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
%107 = spv.IAdd %99, %7 : i32
%108 = spv.IAdd %107, %96 : i32
%109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
// Group 0 (accumulator %90): load three consecutive f32 values from binding 0 starting at
//   %117 = (wg.z * 8 + lid.z * 8 + %74) * 675 + (wg.y * 8 + lid.y * 2 + %89) * 3
// %116 (the second term) is reused by the three groups that follow.
// NOTE(review): 675 = 225 * 3, presumably the row stride of a 225x225x3 image - confirm.
%111 = spv.IAdd %37, %74 : i32
%112 = spv.IAdd %38, %89 : i32
%113 = spv.IAdd %26, %111 : i32
%114 = spv.IAdd %27, %112 : i32
%115 = spv.IMul %113, %11 : i32
%116 = spv.IMul %114, %5 : i32
%117 = spv.IAdd %115, %116 : i32
%118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : f32
%120 = spv.IAdd %117, %4 : i32
%121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%122 = spv.Load "StorageBuffer" %121 : f32
%123 = spv.IAdd %117, %2 : i32
%124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
// Pack the three scalars, then broadcast each one to a vec4 and multiply-accumulate:
//   %138 = s2 * %110 + (s1 * %106 + (s0 * %102 + %90))
%126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
%127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
%128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
%129 = spv.FMul %128, %102 : vector<4xf32>
%130 = spv.FAdd %129, %90 : vector<4xf32>
%131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.FMul %132, %106 : vector<4xf32>
%134 = spv.FAdd %133, %130 : vector<4xf32>
%135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
%136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
%137 = spv.FMul %136, %110 : vector<4xf32>
%138 = spv.FAdd %137, %134 : vector<4xf32>
// Group 1 (accumulator %91): same pattern with %74 + 2 in the term multiplied by 675.
%139 = spv.IAdd %74, %2 : i32
%140 = spv.IAdd %37, %139 : i32
%141 = spv.IAdd %26, %140 : i32
%142 = spv.IMul %141, %11 : i32
%143 = spv.IAdd %142, %116 : i32
%144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%145 = spv.Load "StorageBuffer" %144 : f32
%146 = spv.IAdd %143, %4 : i32
%147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%148 = spv.Load "StorageBuffer" %147 : f32
%149 = spv.IAdd %143, %2 : i32
%150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%151 = spv.Load "StorageBuffer" %150 : f32
%152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
%153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
%154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
%155 = spv.FMul %154, %102 : vector<4xf32>
%156 = spv.FAdd %155, %91 : vector<4xf32>
%157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
%158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
%159 = spv.FMul %158, %106 : vector<4xf32>
%160 = spv.FAdd %159, %156 : vector<4xf32>
%161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
%162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
%163 = spv.FMul %162, %110 : vector<4xf32>
%164 = spv.FAdd %163, %160 : vector<4xf32>
// Group 2 (accumulator %92): same pattern with %74 + 4.
%165 = spv.IAdd %74, %1 : i32
%166 = spv.IAdd %37, %165 : i32
%167 = spv.IAdd %26, %166 : i32
%168 = spv.IMul %167, %11 : i32
%169 = spv.IAdd %168, %116 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IAdd %169, %4 : i32
%173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%174 = spv.Load "StorageBuffer" %173 : f32
%175 = spv.IAdd %169, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %102 : vector<4xf32>
%182 = spv.FAdd %181, %92 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %106 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %110 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
// Group 3 (accumulator %93): same pattern with %74 + 6.
%191 = spv.IAdd %74, %3 : i32
%192 = spv.IAdd %37, %191 : i32
%193 = spv.IAdd %26, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IAdd %194, %116 : i32
%196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%197 = spv.Load "StorageBuffer" %196 : f32
%198 = spv.IAdd %195, %4 : i32
%199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%200 = spv.Load "StorageBuffer" %199 : f32
%201 = spv.IAdd %195, %2 : i32
%202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
%205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
%206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
%207 = spv.FMul %206, %102 : vector<4xf32>
%208 = spv.FAdd %207, %93 : vector<4xf32>
%209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
%210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
%211 = spv.FMul %210, %106 : vector<4xf32>
%212 = spv.FAdd %211, %208 : vector<4xf32>
%213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.FMul %214, %110 : vector<4xf32>
%216 = spv.FAdd %215, %212 : vector<4xf32>
// Spill the updated accumulators so they can be read after the loop's merge block,
// then step %89 by 1 and pass the same values back as block arguments.
spv.Store "Function" %80, %138 : vector<4xf32>
spv.Store "Function" %81, %164 : vector<4xf32>
spv.Store "Function" %82, %190 : vector<4xf32>
spv.Store "Function" %83, %216 : vector<4xf32>
%217 = spv.IAdd %89, %4 : i32
spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Reload the inner-loop results and copy them into the outer-loop slots
// (%80 -> %39, %81 -> %40, %82 -> %41, %83 -> %42), then step %74 by 1.
%84 = spv.Load "Function" %83 : vector<4xf32>
%85 = spv.Load "Function" %82 : vector<4xf32>
%86 = spv.Load "Function" %81 : vector<4xf32>
%87 = spv.Load "Function" %80 : vector<4xf32>
spv.Store "Function" %39, %87 : vector<4xf32>
spv.Store "Function" %40, %86 : vector<4xf32>
spv.Store "Function" %41, %85 : vector<4xf32>
spv.Store "Function" %42, %84 : vector<4xf32>
%88 = spv.IAdd %74, %4 : i32
spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulators: %46 = group 0, %45 = group 1, %44 = group 2, %43 = group 3.
%43 = spv.Load "Function" %42 : vector<4xf32>
%44 = spv.Load "Function" %41 : vector<4xf32>
%45 = spv.Load "Function" %40 : vector<4xf32>
%46 = spv.Load "Function" %39 : vector<4xf32>
// Write the four results to binding 2 at element index
//   (wg.z * 4 + lid.z * 4 + k) * 896 + (wg.y * 4 + lid.y) * 8 + (wg.x * 16 + lid.x * 4) / 4
// with k = 3, 2, 1, 0 for %43, %44, %45, %46 respectively.
// NOTE(review): 896 = 112 * 8, presumably the vec4 row stride of a 112x112x32 output - confirm.
%47 = spv.IAdd %35, %5 : i32
%48 = spv.IAdd %23, %47 : i32
%49 = spv.IAdd %24, %32 : i32
%50 = spv.IAdd %25, %36 : i32
%51 = spv.SDiv %50, %1 : i32
%52 = spv.IMul %48, %12 : i32
%53 = spv.IMul %49, %8 : i32
%54 = spv.IAdd %52, %53 : i32
%55 = spv.IAdd %54, %51 : i32
%56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
%57 = spv.IAdd %35, %2 : i32
%58 = spv.IAdd %23, %57 : i32
%59 = spv.IMul %58, %12 : i32
%60 = spv.IAdd %59, %53 : i32
%61 = spv.IAdd %60, %51 : i32
%62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
%63 = spv.IAdd %35, %4 : i32
%64 = spv.IAdd %23, %63 : i32
%65 = spv.IMul %64, %12 : i32
%66 = spv.IAdd %65, %53 : i32
%67 = spv.IAdd %66, %51 : i32
%68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
%69 = spv.IAdd %23, %35 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IAdd %70, %53 : i32
%72 = spv.IAdd %71, %51 : i32
%73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
spv.Return
}
// Entry point declaration: GLCompute execution model, interface lists the two built-ins,
// and the local workgroup size is fixed at 4 x 4 x 1 invocations.
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass ***
// Executable variant selected for devices matching the "vulkan*" filter. It bundles the
// dispatch entry point, the SPIR-V module that implements it, and the binding interface.
hal.executable.target @vulkan_spirv, filter="vulkan*" {
// Entry point @conv_dispatch_0 (ordinal 0, interface @legacy_io). Its region does not use the
// three index arguments and always returns the constants (2, 28, 28).
// NOTE(review): these are presumably the workgroup counts along x, y, z -- confirm against the dispatch site.
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c2 = constant 2 : index
%c28 = constant 28 : index
%c28_0 = constant 28 : index
hal.return %c2, %c28, %c28_0 : index, index, index
}
// Wrapper module carrying the SPIR-V target environment: v1.3 capability/extension lists,
// ARM integrated GPU, and the resource limits listed in the attribute dictionary.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// The SPIR-V module itself only requires v1.0, the Shader capability and
// SPV_KHR_storage_buffer_storage_class.
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
// Built-in inputs read by the kernel.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Storage buffers, all in descriptor set 0:
//   binding 2: 100352 x vector<4xf32> (100352 * 4 = 1*112*112*32), stored to by the kernel;
//   binding 1: 216 x vector<4xf32>    (216 * 4 = 3*3*3*32), loaded as vectors;
//   binding 0: 151875 x f32           (151875 = 1*225*225*3), loaded as scalars.
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
// Kernel body. Each invocation runs a 3 x 3 loop nest; every iteration loads three filter
// vectors and four groups of three input scalars, and updates four vector<4xf32>
// accumulators. After the loops the four accumulators are stored to binding 2.
spv.func @conv_dispatch_0() "None" {
// Zero vector used as the initial value of all four accumulators.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
// Integer constants. %5 (3) is both the loop bound and the input column multiplier.
// %11 (675 = 225 * 3) scales the input row index; %12 (896 = 112 * 32 / 4) scales the
// output row index; %9 (72) and %10 (24) scale the outer/inner loop counters into the
// filter buffer; %8 (8) is reused as filter step, output column multiplier and input tile size.
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
// Buffer pointers: %13 -> binding 0 (f32 input), %14 -> binding 1 (vec4 filter), %15 -> binding 2 (vec4 output).
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// Read WorkgroupId; the built-in is reloaded once per extracted component
// (%18 = x, %20 = y, %22 = z).
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.Load "Input" %16 : vector<3xi32>
%20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
%21 = spv.Load "Input" %16 : vector<3xi32>
%22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
// Per-workgroup offsets: %23 = z*4 and %24 = y*4 feed the output row/column index,
// %25 = x*16 feeds the channel index, %26 = z*8 and %27 = y*8 feed the input row/column index.
%23 = spv.IMul %22, %1 : i32
%24 = spv.IMul %20, %1 : i32
%25 = spv.IMul %18, %7 : i32
%26 = spv.IMul %22, %8 : i32
%27 = spv.IMul %20, %8 : i32
// Read LocalInvocationId the same way (%30 = x, %32 = y, %34 = z).
%28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%29 = spv.Load "Input" %28 : vector<3xi32>
%30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
%31 = spv.Load "Input" %28 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.Load "Input" %28 : vector<3xi32>
%34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
// Per-invocation offsets: %35 = z*4 (output row), %36 = x*4 (channel),
// %37 = z*8 (input row), %38 = y*2 (input column).
%35 = spv.IMul %34, %1 : i32
%36 = spv.IMul %30, %1 : i32
%37 = spv.IMul %34, %8 : i32
%38 = spv.IMul %32, %2 : i32
// Function-storage slots used to carry the four accumulators out of the outer loop.
%39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: %74 runs from 0 while %74 < 3; %75..%78 are the four accumulators, initially zero.
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%79 = spv.SLessThan %74, %5 : i32
spv.BranchConditional %79, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Slots that carry the accumulators out of the inner loop.
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: %89 runs from 0 while %89 < 3; %90..%93 continue from the outer loop's accumulators.
spv.mlir.loop {
spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%94 = spv.SLessThan %89, %5 : i32
spv.BranchConditional %94, ^bb2, ^bb3
^bb2: // pred: ^bb1
// %96 = (workgroup.x*16 + local.x*4) / 4: vec4 index of this invocation's channel group.
%95 = spv.IAdd %25, %36 : i32
%96 = spv.SDiv %95, %1 : i32
// Load three filter vectors from binding 1 at %74*72 + %89*24 + {0, 8, 16} + %96.
%97 = spv.IMul %74, %9 : i32
%98 = spv.IMul %89, %10 : i32
%99 = spv.IAdd %97, %98 : i32
%100 = spv.IAdd %99, %96 : i32
%101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
%103 = spv.IAdd %99, %8 : i32
%104 = spv.IAdd %103, %96 : i32
%105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
%107 = spv.IAdd %99, %7 : i32
%108 = spv.IAdd %107, %96 : i32
%109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
// Input element index for the first accumulator:
//   %117 = (wg.z*8 + local.z*8 + %74) * 675 + (wg.y*8 + local.y*2 + %89) * 3.
// %116 (the column term) is shared by all four input loads groups below.
%111 = spv.IAdd %37, %74 : i32
%112 = spv.IAdd %38, %89 : i32
%113 = spv.IAdd %26, %111 : i32
%114 = spv.IAdd %27, %112 : i32
%115 = spv.IMul %113, %11 : i32
%116 = spv.IMul %114, %5 : i32
%117 = spv.IAdd %115, %116 : i32
// Load three consecutive f32 values (%117, %117+1, %117+2) and pack them into a vector<3xf32>.
%118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : f32
%120 = spv.IAdd %117, %4 : i32
%121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%122 = spv.Load "StorageBuffer" %121 : f32
%123 = spv.IAdd %117, %2 : i32
%124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
// First accumulator: replicate each of the three scalars across four lanes, multiply by
// the matching filter vector (%102, %106, %110) and add onto %90. Result: %138.
%127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
%128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
%129 = spv.FMul %128, %102 : vector<4xf32>
%130 = spv.FAdd %129, %90 : vector<4xf32>
%131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.FMul %132, %106 : vector<4xf32>
%134 = spv.FAdd %133, %130 : vector<4xf32>
%135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
%136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
%137 = spv.FMul %136, %110 : vector<4xf32>
%138 = spv.FAdd %137, %134 : vector<4xf32>
// Second accumulator: same computation with the input row index advanced by 2 (%74 + 2),
// accumulating onto %91. Result: %164.
%139 = spv.IAdd %74, %2 : i32
%140 = spv.IAdd %37, %139 : i32
%141 = spv.IAdd %26, %140 : i32
%142 = spv.IMul %141, %11 : i32
%143 = spv.IAdd %142, %116 : i32
%144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%145 = spv.Load "StorageBuffer" %144 : f32
%146 = spv.IAdd %143, %4 : i32
%147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%148 = spv.Load "StorageBuffer" %147 : f32
%149 = spv.IAdd %143, %2 : i32
%150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%151 = spv.Load "StorageBuffer" %150 : f32
%152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
%153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
%154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
%155 = spv.FMul %154, %102 : vector<4xf32>
%156 = spv.FAdd %155, %91 : vector<4xf32>
%157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
%158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
%159 = spv.FMul %158, %106 : vector<4xf32>
%160 = spv.FAdd %159, %156 : vector<4xf32>
%161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
%162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
%163 = spv.FMul %162, %110 : vector<4xf32>
%164 = spv.FAdd %163, %160 : vector<4xf32>
// Third accumulator: input row index advanced by 4 (%74 + 4), accumulating onto %92. Result: %190.
%165 = spv.IAdd %74, %1 : i32
%166 = spv.IAdd %37, %165 : i32
%167 = spv.IAdd %26, %166 : i32
%168 = spv.IMul %167, %11 : i32
%169 = spv.IAdd %168, %116 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IAdd %169, %4 : i32
%173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%174 = spv.Load "StorageBuffer" %173 : f32
%175 = spv.IAdd %169, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %102 : vector<4xf32>
%182 = spv.FAdd %181, %92 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %106 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %110 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
// Fourth accumulator: input row index advanced by 6 (%74 + 6), accumulating onto %93. Result: %216.
%191 = spv.IAdd %74, %3 : i32
%192 = spv.IAdd %37, %191 : i32
%193 = spv.IAdd %26, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IAdd %194, %116 : i32
%196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%197 = spv.Load "StorageBuffer" %196 : f32
%198 = spv.IAdd %195, %4 : i32
%199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%200 = spv.Load "StorageBuffer" %199 : f32
%201 = spv.IAdd %195, %2 : i32
%202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
%205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
%206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
%207 = spv.FMul %206, %102 : vector<4xf32>
%208 = spv.FAdd %207, %93 : vector<4xf32>
%209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
%210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
%211 = spv.FMul %210, %106 : vector<4xf32>
%212 = spv.FAdd %211, %208 : vector<4xf32>
%213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.FMul %214, %110 : vector<4xf32>
%216 = spv.FAdd %215, %212 : vector<4xf32>
// Publish the updated accumulators to the inner-loop slots, then advance %89 by 1 and loop.
spv.Store "Function" %80, %138 : vector<4xf32>
spv.Store "Function" %81, %164 : vector<4xf32>
spv.Store "Function" %82, %190 : vector<4xf32>
spv.Store "Function" %83, %216 : vector<4xf32>
%217 = spv.IAdd %89, %4 : i32
spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Reload the inner loop's results, copy them to the outer-loop slots, advance %74 by 1 and loop.
%84 = spv.Load "Function" %83 : vector<4xf32>
%85 = spv.Load "Function" %82 : vector<4xf32>
%86 = spv.Load "Function" %81 : vector<4xf32>
%87 = spv.Load "Function" %80 : vector<4xf32>
spv.Store "Function" %39, %87 : vector<4xf32>
spv.Store "Function" %40, %86 : vector<4xf32>
spv.Store "Function" %41, %85 : vector<4xf32>
spv.Store "Function" %42, %84 : vector<4xf32>
%88 = spv.IAdd %74, %4 : i32
spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulator values: %46 = first, %45 = second, %44 = third, %43 = fourth.
%43 = spv.Load "Function" %42 : vector<4xf32>
%44 = spv.Load "Function" %41 : vector<4xf32>
%45 = spv.Load "Function" %40 : vector<4xf32>
%46 = spv.Load "Function" %39 : vector<4xf32>
// Output vec4 index = row * 896 + column * 8 + channel group, where
// row = wg.z*4 + local.z*4 + k, column = wg.y*4 + local.y, channel group = %51.
// Fourth accumulator (input row offset 6) goes to row offset k = 3.
%47 = spv.IAdd %35, %5 : i32
%48 = spv.IAdd %23, %47 : i32
%49 = spv.IAdd %24, %32 : i32
%50 = spv.IAdd %25, %36 : i32
%51 = spv.SDiv %50, %1 : i32
%52 = spv.IMul %48, %12 : i32
%53 = spv.IMul %49, %8 : i32
%54 = spv.IAdd %52, %53 : i32
%55 = spv.IAdd %54, %51 : i32
%56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
// Third accumulator (input row offset 4) goes to row offset k = 2.
%57 = spv.IAdd %35, %2 : i32
%58 = spv.IAdd %23, %57 : i32
%59 = spv.IMul %58, %12 : i32
%60 = spv.IAdd %59, %53 : i32
%61 = spv.IAdd %60, %51 : i32
%62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
// Second accumulator (input row offset 2) goes to row offset k = 1.
%63 = spv.IAdd %35, %4 : i32
%64 = spv.IAdd %23, %63 : i32
%65 = spv.IMul %64, %12 : i32
%66 = spv.IAdd %65, %53 : i32
%67 = spv.IAdd %66, %51 : i32
%68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
// First accumulator (input row offset 0) goes to row offset k = 0.
%69 = spv.IAdd %23, %35 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IAdd %70, %53 : i32
%72 = spv.IAdd %71, %51 : i32
%73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
spv.Return
}
// Expose the function as a GLCompute entry point using the two built-ins, with a
// workgroup (local) size of 4 x 4 x 1.
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// Binding interface: set 0 with two read-only storage buffers (bindings 0 and 1) and one
// write/discard storage buffer (binding 2), matching the bind(0, N) decorations above.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass ***
module {
// Private executable holding the single dispatch @conv_dispatch_0: a binding interface plus
// one target variant (@vulkan_spirv) that contains the SPIR-V implementation.
hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
// Binding interface: descriptor set 0 with two read-only storage buffers (bindings 0, 1)
// and one write/discard storage buffer (binding 2).
hal.interface @legacy_io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// Variant selected for devices matching the "vulkan*" filter.
hal.executable.target @vulkan_spirv, filter="vulkan*" {
// Entry point region: does not use its three index arguments and always returns (2, 28, 28),
// the same values passed as workgroups([...]) at the dispatch site in @conv.
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c2 = constant 2 : index
%c28 = constant 28 : index
%c28_0 = constant 28 : index
hal.return %c2, %c28, %c28_0 : index, index, index
}
// Wrapper module carrying the SPIR-V target environment: v1.3 capability/extension lists,
// ARM integrated GPU, and the resource limits listed in the attribute dictionary.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
// The SPIR-V module itself only requires v1.0, the Shader capability and
// SPV_KHR_storage_buffer_storage_class.
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
// Built-in inputs read by the kernel.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Storage buffers, all in descriptor set 0:
//   binding 2: 100352 x vector<4xf32> (100352 * 4 = 1*112*112*32), stored to by the kernel;
//   binding 1: 216 x vector<4xf32>    (216 * 4 = 3*3*3*32), loaded as vectors;
//   binding 0: 151875 x f32           (151875 = 1*225*225*3), loaded as scalars.
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
// Kernel body. Each invocation runs a 3 x 3 loop nest; every iteration loads three filter
// vectors and four groups of three input scalars, and updates four vector<4xf32>
// accumulators. After the loops the four accumulators are stored to binding 2.
spv.func @conv_dispatch_0() "None" {
// Zero vector used as the initial value of all four accumulators.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
// Integer constants. %5 (3) is both the loop bound and the input column multiplier.
// %11 (675 = 225 * 3) scales the input row index; %12 (896 = 112 * 32 / 4) scales the
// output row index; %9 (72) and %10 (24) scale the outer/inner loop counters into the
// filter buffer; %8 (8) is reused as filter step, output column multiplier and input tile size.
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
// Buffer pointers: %13 -> binding 0 (f32 input), %14 -> binding 1 (vec4 filter), %15 -> binding 2 (vec4 output).
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// Read WorkgroupId; the built-in is reloaded once per extracted component
// (%18 = x, %20 = y, %22 = z).
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.Load "Input" %16 : vector<3xi32>
%20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
%21 = spv.Load "Input" %16 : vector<3xi32>
%22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
// Per-workgroup offsets: %23 = z*4 and %24 = y*4 feed the output row/column index,
// %25 = x*16 feeds the channel index, %26 = z*8 and %27 = y*8 feed the input row/column index.
%23 = spv.IMul %22, %1 : i32
%24 = spv.IMul %20, %1 : i32
%25 = spv.IMul %18, %7 : i32
%26 = spv.IMul %22, %8 : i32
%27 = spv.IMul %20, %8 : i32
// Read LocalInvocationId the same way (%30 = x, %32 = y, %34 = z).
%28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%29 = spv.Load "Input" %28 : vector<3xi32>
%30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
%31 = spv.Load "Input" %28 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.Load "Input" %28 : vector<3xi32>
%34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
// Per-invocation offsets: %35 = z*4 (output row), %36 = x*4 (channel),
// %37 = z*8 (input row), %38 = y*2 (input column).
%35 = spv.IMul %34, %1 : i32
%36 = spv.IMul %30, %1 : i32
%37 = spv.IMul %34, %8 : i32
%38 = spv.IMul %32, %2 : i32
// Function-storage slots used to carry the four accumulators out of the outer loop.
%39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: %74 runs from 0 while %74 < 3; %75..%78 are the four accumulators, initially zero.
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%79 = spv.SLessThan %74, %5 : i32
spv.BranchConditional %79, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Slots that carry the accumulators out of the inner loop.
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: %89 runs from 0 while %89 < 3; %90..%93 continue from the outer loop's accumulators.
spv.mlir.loop {
spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%94 = spv.SLessThan %89, %5 : i32
spv.BranchConditional %94, ^bb2, ^bb3
^bb2: // pred: ^bb1
// %96 = (workgroup.x*16 + local.x*4) / 4: vec4 index of this invocation's channel group.
%95 = spv.IAdd %25, %36 : i32
%96 = spv.SDiv %95, %1 : i32
// Load three filter vectors from binding 1 at %74*72 + %89*24 + {0, 8, 16} + %96.
%97 = spv.IMul %74, %9 : i32
%98 = spv.IMul %89, %10 : i32
%99 = spv.IAdd %97, %98 : i32
%100 = spv.IAdd %99, %96 : i32
%101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
%103 = spv.IAdd %99, %8 : i32
%104 = spv.IAdd %103, %96 : i32
%105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
%107 = spv.IAdd %99, %7 : i32
%108 = spv.IAdd %107, %96 : i32
%109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
// Input element index for the first accumulator:
//   %117 = (wg.z*8 + local.z*8 + %74) * 675 + (wg.y*8 + local.y*2 + %89) * 3.
// %116 (the column term) is shared by all four input loads groups below.
%111 = spv.IAdd %37, %74 : i32
%112 = spv.IAdd %38, %89 : i32
%113 = spv.IAdd %26, %111 : i32
%114 = spv.IAdd %27, %112 : i32
%115 = spv.IMul %113, %11 : i32
%116 = spv.IMul %114, %5 : i32
%117 = spv.IAdd %115, %116 : i32
// Load three consecutive f32 values (%117, %117+1, %117+2) and pack them into a vector<3xf32>.
%118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : f32
%120 = spv.IAdd %117, %4 : i32
%121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%122 = spv.Load "StorageBuffer" %121 : f32
%123 = spv.IAdd %117, %2 : i32
%124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
// First accumulator: replicate each of the three scalars across four lanes, multiply by
// the matching filter vector (%102, %106, %110) and add onto %90. Result: %138.
%127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
%128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
%129 = spv.FMul %128, %102 : vector<4xf32>
%130 = spv.FAdd %129, %90 : vector<4xf32>
%131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.FMul %132, %106 : vector<4xf32>
%134 = spv.FAdd %133, %130 : vector<4xf32>
%135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
%136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
%137 = spv.FMul %136, %110 : vector<4xf32>
%138 = spv.FAdd %137, %134 : vector<4xf32>
// Second accumulator: same computation with the input row index advanced by 2 (%74 + 2),
// accumulating onto %91. Result: %164.
%139 = spv.IAdd %74, %2 : i32
%140 = spv.IAdd %37, %139 : i32
%141 = spv.IAdd %26, %140 : i32
%142 = spv.IMul %141, %11 : i32
%143 = spv.IAdd %142, %116 : i32
%144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%145 = spv.Load "StorageBuffer" %144 : f32
%146 = spv.IAdd %143, %4 : i32
%147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%148 = spv.Load "StorageBuffer" %147 : f32
%149 = spv.IAdd %143, %2 : i32
%150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%151 = spv.Load "StorageBuffer" %150 : f32
%152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
%153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
%154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
%155 = spv.FMul %154, %102 : vector<4xf32>
%156 = spv.FAdd %155, %91 : vector<4xf32>
%157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
%158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
%159 = spv.FMul %158, %106 : vector<4xf32>
%160 = spv.FAdd %159, %156 : vector<4xf32>
%161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
%162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
%163 = spv.FMul %162, %110 : vector<4xf32>
%164 = spv.FAdd %163, %160 : vector<4xf32>
// Third accumulator: input row index advanced by 4 (%74 + 4), accumulating onto %92. Result: %190.
%165 = spv.IAdd %74, %1 : i32
%166 = spv.IAdd %37, %165 : i32
%167 = spv.IAdd %26, %166 : i32
%168 = spv.IMul %167, %11 : i32
%169 = spv.IAdd %168, %116 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IAdd %169, %4 : i32
%173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%174 = spv.Load "StorageBuffer" %173 : f32
%175 = spv.IAdd %169, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %102 : vector<4xf32>
%182 = spv.FAdd %181, %92 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %106 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %110 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
// Fourth accumulator: input row index advanced by 6 (%74 + 6), accumulating onto %93. Result: %216.
%191 = spv.IAdd %74, %3 : i32
%192 = spv.IAdd %37, %191 : i32
%193 = spv.IAdd %26, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IAdd %194, %116 : i32
%196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%197 = spv.Load "StorageBuffer" %196 : f32
%198 = spv.IAdd %195, %4 : i32
%199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%200 = spv.Load "StorageBuffer" %199 : f32
%201 = spv.IAdd %195, %2 : i32
%202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
%205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
%206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
%207 = spv.FMul %206, %102 : vector<4xf32>
%208 = spv.FAdd %207, %93 : vector<4xf32>
%209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
%210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
%211 = spv.FMul %210, %106 : vector<4xf32>
%212 = spv.FAdd %211, %208 : vector<4xf32>
%213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.FMul %214, %110 : vector<4xf32>
%216 = spv.FAdd %215, %212 : vector<4xf32>
// Publish the updated accumulators to the inner-loop slots, then advance %89 by 1 and loop.
spv.Store "Function" %80, %138 : vector<4xf32>
spv.Store "Function" %81, %164 : vector<4xf32>
spv.Store "Function" %82, %190 : vector<4xf32>
spv.Store "Function" %83, %216 : vector<4xf32>
%217 = spv.IAdd %89, %4 : i32
spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Reload the inner loop's results, copy them to the outer-loop slots, advance %74 by 1 and loop.
%84 = spv.Load "Function" %83 : vector<4xf32>
%85 = spv.Load "Function" %82 : vector<4xf32>
%86 = spv.Load "Function" %81 : vector<4xf32>
%87 = spv.Load "Function" %80 : vector<4xf32>
spv.Store "Function" %39, %87 : vector<4xf32>
spv.Store "Function" %40, %86 : vector<4xf32>
spv.Store "Function" %41, %85 : vector<4xf32>
spv.Store "Function" %42, %84 : vector<4xf32>
%88 = spv.IAdd %74, %4 : i32
spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulator values: %46 = first, %45 = second, %44 = third, %43 = fourth.
%43 = spv.Load "Function" %42 : vector<4xf32>
%44 = spv.Load "Function" %41 : vector<4xf32>
%45 = spv.Load "Function" %40 : vector<4xf32>
%46 = spv.Load "Function" %39 : vector<4xf32>
// Output vec4 index = row * 896 + column * 8 + channel group, where
// row = wg.z*4 + local.z*4 + k, column = wg.y*4 + local.y, channel group = %51.
// Fourth accumulator (input row offset 6) goes to row offset k = 3.
%47 = spv.IAdd %35, %5 : i32
%48 = spv.IAdd %23, %47 : i32
%49 = spv.IAdd %24, %32 : i32
%50 = spv.IAdd %25, %36 : i32
%51 = spv.SDiv %50, %1 : i32
%52 = spv.IMul %48, %12 : i32
%53 = spv.IMul %49, %8 : i32
%54 = spv.IAdd %52, %53 : i32
%55 = spv.IAdd %54, %51 : i32
%56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
// Third accumulator (input row offset 4) goes to row offset k = 2.
%57 = spv.IAdd %35, %2 : i32
%58 = spv.IAdd %23, %57 : i32
%59 = spv.IMul %58, %12 : i32
%60 = spv.IAdd %59, %53 : i32
%61 = spv.IAdd %60, %51 : i32
%62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
// Second accumulator (input row offset 2) goes to row offset k = 1.
%63 = spv.IAdd %35, %4 : i32
%64 = spv.IAdd %23, %63 : i32
%65 = spv.IMul %64, %12 : i32
%66 = spv.IAdd %65, %53 : i32
%67 = spv.IAdd %66, %51 : i32
%68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
// First accumulator (input row offset 0) goes to row offset k = 0.
%69 = spv.IAdd %23, %35 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IAdd %70, %53 : i32
%72 = spv.IAdd %71, %51 : i32
%73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
spv.Return
}
// Expose the function as a GLCompute entry point using the two built-ins, with a
// workgroup (local) size of 4 x 4 x 1.
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// Private copy of the binding interface nested inside the target's module; same three bindings.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
// Exported host function @conv: takes two HAL buffers, allocates a result buffer, records a
// one-shot command buffer that binds the three buffers and dispatches @conv_dispatch_0,
// submits it, waits for completion and returns the result buffer.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Result shape 1 x 112 x 112 x 32.
%c1 = constant 1 : index
%c112 = constant 112 : index
%c112_0 = constant 112 : index
%c32 = constant 32 : index
// Element-type code handed to compute_size (50331680 = 0x03000020).
// NOTE(review): presumably the HAL encoding for f32 -- confirm against the HAL element type definitions.
%c50331680_i32 = constant 50331680 : i32
// Compute the byte size of the result and allocate it as a host- and device-visible,
// device-local buffer usable for transfer, mapping and dispatch.
%sz = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c112, %c112_0, %c32]) type(%c50331680_i32) : index
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%sz}
// Start recording a one-shot command buffer.
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// These two constants are only forwarded as operands of the device switch below.
%c112_1 = constant 112 : index
%c32_2 = constant 32 : index
// Executable layout: one descriptor set with three StorageBuffer bindings
// (0: R, 1: R, 2: DW).
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
// %c0 is the byte offset used for all three bindings.
%c0 = constant 0 : index
// Byte size of %arg0, computed for shape 1 x 225 x 225 x 3 with %arg0's own allocator.
%c1_3 = constant 1 : index
%c225 = constant 225 : index
%c225_4 = constant 225 : index
%c3 = constant 3 : index
%allocator_5 = hal.buffer.allocator<%arg0 : !hal.buffer> : !hal.allocator
%c50331680_i32_6 = constant 50331680 : i32
%sz_7 = hal.allocator.compute_size<%allocator_5 : !hal.allocator> shape([%c1_3, %c225, %c225_4, %c3]) type(%c50331680_i32_6) : index
// %c0_8 is binding ordinal 0. Then the byte size of %arg1 for shape 3 x 3 x 3 x 32.
%c0_8 = constant 0 : index
%c3_9 = constant 3 : index
%c3_10 = constant 3 : index
%c3_11 = constant 3 : index
%c32_12 = constant 32 : index
%allocator_13 = hal.buffer.allocator<%arg1 : !hal.buffer> : !hal.allocator
%c50331680_i32_14 = constant 50331680 : i32
%sz_15 = hal.allocator.compute_size<%allocator_13 : !hal.allocator> shape([%c3_9, %c3_10, %c3_11, %c32_12]) type(%c50331680_i32_14) : index
// %c1_16 is binding ordinal 1. Then the byte size of the result buffer, recomputed for
// shape 1 x 112 x 112 x 32 with the buffer's allocator.
%c1_16 = constant 1 : index
%c1_17 = constant 1 : index
%c112_18 = constant 112 : index
%c112_19 = constant 112 : index
%c32_20 = constant 32 : index
%allocator_21 = hal.buffer.allocator<%buffer : !hal.buffer> : !hal.allocator
%c50331680_i32_22 = constant 50331680 : i32
%sz_23 = hal.allocator.compute_size<%allocator_21 : !hal.allocator> shape([%c1_17, %c112_18, %c112_19, %c32_20]) type(%c50331680_i32_22) : index
// %c2 is binding ordinal 2; %c0_24 is the descriptor set index.
%c2 = constant 2 : index
%c0_24 = constant 0 : index
// Bind set 0: binding 0 -> %arg0, binding 1 -> %arg1, binding 2 -> %buffer, each as
// the range [offset 0, computed size].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0_24] bindings([
%c0_8 = (%arg0 : !hal.buffer)[%c0, %sz_7],
%c1_16 = (%arg1 : !hal.buffer)[%c0, %sz_15],
%c2 = (%buffer : !hal.buffer)[%c0, %sz_23]
])
// Device-conditional dispatch: for devices whose id matches "vulkan*", dispatch the
// @vulkan_spirv variant of @conv_dispatch_0 over 2 x 28 x 28 workgroups. The region
// arguments %arg3..%arg5 (32, 112, 112) are captured but not used inside the region.
hal.device.switch<%device : !hal.device>
#hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32_2 : index, %arg4 = %c112_1 : index, %arg5 = %c112_1 : index) {
%c2_25 = constant 2 : index
%c28 = constant 28 : index
%c28_26 = constant 28 : index
hal.command_buffer.dispatch.symbol<%arg2 : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv::@conv_dispatch_0) workgroups([%c2_25, %c28, %c28_26])
hal.return
}
// Barrier after the dispatch, finish recording, submit and block until done, then return
// the result buffer.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
}
}
// *** IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionRankedShapeDimsPass ***
// Host-side entry point. It allocates the result buffer, records a one-shot
// command buffer that binds %arg0, %arg1 and the result buffer, dispatches
// @conv_dispatch_0 for devices matching "vulkan*", submits the command buffer,
// waits for it, and returns the result buffer.
// Several constants below are defined more than once with the same value
// (e.g. %c112 / %c112_0 / %c112_1); they are kept exactly as the pass emitted them.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%c1 = constant 1 : index
%c112 = constant 112 : index
%c112_0 = constant 112 : index
%c32 = constant 32 : index
// Element-type code passed to compute_size; NOTE(review): presumably denotes f32 — confirm.
%c50331680_i32 = constant 50331680 : i32
// Size of the result buffer for shape [1, 112, 112, 32].
%sz = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c112, %c112_0, %c32]) type(%c50331680_i32) : index
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%sz}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// %c112_1 and %c32_2 are only used as captured operands of the device switch below.
%c112_1 = constant 112 : index
%c32_2 = constant 32 : index
// One descriptor set with three storage-buffer bindings: 0 and 1 are R, 2 is DW.
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
%c0 = constant 0 : index
// Size of %arg0 for shape [1, 225, 225, 3], computed with the allocator that owns %arg0.
%c1_3 = constant 1 : index
%c225 = constant 225 : index
%c225_4 = constant 225 : index
%c3 = constant 3 : index
%allocator_5 = hal.buffer.allocator<%arg0 : !hal.buffer> : !hal.allocator
%c50331680_i32_6 = constant 50331680 : i32
%sz_7 = hal.allocator.compute_size<%allocator_5 : !hal.allocator> shape([%c1_3, %c225, %c225_4, %c3]) type(%c50331680_i32_6) : index
// %c0_8 is the binding ordinal (0) used for %arg0 in the descriptor set below.
%c0_8 = constant 0 : index
// Size of %arg1 for shape [3, 3, 3, 32], computed with the allocator that owns %arg1.
%c3_9 = constant 3 : index
%c3_10 = constant 3 : index
%c3_11 = constant 3 : index
%c32_12 = constant 32 : index
%allocator_13 = hal.buffer.allocator<%arg1 : !hal.buffer> : !hal.allocator
%c50331680_i32_14 = constant 50331680 : i32
%sz_15 = hal.allocator.compute_size<%allocator_13 : !hal.allocator> shape([%c3_9, %c3_10, %c3_11, %c32_12]) type(%c50331680_i32_14) : index
// %c1_16 is the binding ordinal (1) used for %arg1 in the descriptor set below.
%c1_16 = constant 1 : index
// Size of the result buffer again, this time via the allocator queried from %buffer.
%c1_17 = constant 1 : index
%c112_18 = constant 112 : index
%c112_19 = constant 112 : index
%c32_20 = constant 32 : index
%allocator_21 = hal.buffer.allocator<%buffer : !hal.buffer> : !hal.allocator
%c50331680_i32_22 = constant 50331680 : i32
%sz_23 = hal.allocator.compute_size<%allocator_21 : !hal.allocator> shape([%c1_17, %c112_18, %c112_19, %c32_20]) type(%c50331680_i32_22) : index
// %c2 is the binding ordinal (2) for the result buffer; %c0_24 is the descriptor set index (0).
%c2 = constant 2 : index
%c0_24 = constant 0 : index
// Bind each buffer at offset %c0 with its computed size: 0 -> %arg0, 1 -> %arg1, 2 -> %buffer.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0_24] bindings([
%c0_8 = (%arg0 : !hal.buffer)[%c0, %sz_7],
%c1_16 = (%arg1 : !hal.buffer)[%c0, %sz_15],
%c2 = (%buffer : !hal.buffer)[%c0, %sz_23]
])
// Only a "vulkan*" case is present. The region captures %arg3, %arg4 and %arg5
// but its body reads only %arg2 (the command buffer).
hal.device.switch<%device : !hal.device>
#hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32_2 : index, %arg4 = %c112_1 : index, %arg5 = %c112_1 : index) {
// Workgroup counts passed to the dispatch: [2, 28, 28].
%c2_25 = constant 2 : index
%c28 = constant 28 : index
%c28_26 = constant 28 : index
hal.command_buffer.dispatch.symbol<%arg2 : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv::@conv_dispatch_0) workgroups([%c2_25, %c28, %c28_26])
hal.return
}
// Barrier recorded after the dispatch, then the command buffer is closed and executed synchronously.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
}
// *** IR Dump After Canonicalizer ***
// Host-side entry point with all sizes already present as constants. It
// allocates the result buffer, records a one-shot command buffer that binds the
// two inputs and the result, dispatches @conv_dispatch_0 for "vulkan*" devices,
// submits, waits, and returns the result buffer.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
// Binding lengths used below: 3456 = 3*3*3*32*4, 607500 = 1*225*225*3*4,
// 1605632 = 1*112*112*32*4. NOTE(review): the factor 4 is presumably the f32 byte width — confirm.
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%c1605632 = constant 1605632 : index
// %c0, %c1 and %c2 serve as binding ordinals; %c0 is also the set index and the buffer offset.
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// One descriptor set with three storage-buffer bindings: 0 and 1 are R, 2 is DW.
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
// Bind 0 -> %arg0, 1 -> %arg1, 2 -> %buffer, each starting at offset 0.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
%c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
])
// Only a "vulkan*" case is present. The region captures %arg3, %arg4 and %arg5
// but its body reads only %arg2 (the command buffer).
hal.device.switch<%device : !hal.device>
#hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32 : index, %arg4 = %c112 : index, %arg5 = %c112 : index) {
// Workgroup counts passed to the dispatch: [2, 28, 28].
%c2_0 = constant 2 : index
%c28 = constant 28 : index
hal.command_buffer.dispatch.symbol<%arg2 : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv::@conv_dispatch_0) workgroups([%c2_0, %c28, %c28])
hal.return
}
// Barrier recorded after the dispatch, then the command buffer is closed and executed synchronously.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::PublicABIGenerationPass ***
module {
hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
// Binding interface of the executable: three storage buffers in set 0.
// @ro0 and @ro1 are read-only; @wo2 is written with its previous contents discarded.
hal.interface @legacy_io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.target @vulkan_spirv, filter="vulkan*" {
// Entry point record for @conv_dispatch_0 (ordinal 0, interface @legacy_io).
// Its region ignores the three index block arguments and returns the constants
// (2, 28, 28). NOTE(review): presumably the workgroup counts for x, y, z; the
// host-side dispatch uses the same three values — confirm.
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c2 = constant 2 : index
%c28 = constant 28 : index
%c28_0 = constant 28 : index
hal.return %c2, %c28, %c28_0 : index, index, index
}
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
// Built-in inputs read by the kernel: the invocation id within the workgroup and the workgroup id.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Storage buffers, all in descriptor set 0:
//   binding 2: 100352 x vector<4xf32> (100352 * 4 = 1*112*112*32 scalars) - stored to by the kernel
//   binding 1: 216 x vector<4xf32>    (216 * 4 = 3*3*3*32 scalars)        - loaded as vec4
//   binding 0: 151875 x f32           (151875 = 1*225*225*3 scalars)      - loaded as scalars
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
// Convolution kernel body. Each invocation keeps four vector<4xf32> accumulators,
// runs a 3 x 3 loop nest that loads three vec4 values from binding 1 and four
// groups of three scalars from binding 0 per iteration, and finally stores the
// four accumulators to binding 2.
// Notation used in the comments below: wg = WorkgroupId, li = LocalInvocationId.
spv.func @conv_dispatch_0() "None" {
// Zero vector used to initialise the accumulators, followed by the integer
// constants used in the index arithmetic.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
// %13 = binding 0 (scalar f32 array), %14 = binding 1 (vec4 array), %15 = binding 2 (vec4 array).
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// WorkgroupId is loaded once per component: %18 = wg.x, %20 = wg.y, %22 = wg.z.
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.Load "Input" %16 : vector<3xi32>
%20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
%21 = spv.Load "Input" %16 : vector<3xi32>
%22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
// Per-workgroup offsets: %23 = wg.z*4, %24 = wg.y*4, %25 = wg.x*16 (used for
// the binding 2 store index; %25 also for the binding 1 index);
// %26 = wg.z*8, %27 = wg.y*8 (used for the binding 0 load index).
%23 = spv.IMul %22, %1 : i32
%24 = spv.IMul %20, %1 : i32
%25 = spv.IMul %18, %7 : i32
%26 = spv.IMul %22, %8 : i32
%27 = spv.IMul %20, %8 : i32
// LocalInvocationId is loaded once per component: %30 = li.x, %32 = li.y, %34 = li.z.
%28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%29 = spv.Load "Input" %28 : vector<3xi32>
%30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
%31 = spv.Load "Input" %28 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.Load "Input" %28 : vector<3xi32>
%34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
// Per-invocation offsets: %35 = li.z*4, %36 = li.x*4, %37 = li.z*8, %38 = li.y*2.
%35 = spv.IMul %34, %1 : i32
%36 = spv.IMul %30, %1 : i32
%37 = spv.IMul %34, %8 : i32
%38 = spv.IMul %32, %2 : i32
// Function-local slots that carry the four accumulators out of the outer loop.
%39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: %74 runs 0, 1, 2 (SLessThan against 3, incremented by 1);
// %75..%78 are the four accumulators, all starting at zero.
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%79 = spv.SLessThan %74, %5 : i32
spv.BranchConditional %79, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Function-local slots that carry the four accumulators out of the inner loop.
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: %89 runs 0, 1, 2; %90..%93 continue from the outer accumulators.
spv.mlir.loop {
spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%94 = spv.SLessThan %89, %5 : i32
spv.BranchConditional %94, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Binding 1 loads. %96 = (wg.x*16 + li.x*4) / 4 and %99 = %74*72 + %89*24.
// Three vec4 values are read at %99 + {0, 8, 16} + %96 into %102, %106, %110.
%95 = spv.IAdd %25, %36 : i32
%96 = spv.SDiv %95, %1 : i32
%97 = spv.IMul %74, %9 : i32
%98 = spv.IMul %89, %10 : i32
%99 = spv.IAdd %97, %98 : i32
%100 = spv.IAdd %99, %96 : i32
%101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
%103 = spv.IAdd %99, %8 : i32
%104 = spv.IAdd %103, %96 : i32
%105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
%107 = spv.IAdd %99, %7 : i32
%108 = spv.IAdd %107, %96 : i32
%109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
// Binding 0 loads for accumulator 0. %113 = wg.z*8 + li.z*8 + %74 and
// %114 = wg.y*8 + li.y*2 + %89; the scalar index is %113*675 + %114*3, and the
// three consecutive scalars at +0, +1, +2 are read.
// NOTE(review): 675 = 225*3, so %113 / %114 presumably index the two 225-sized
// dimensions of the 1x225x225x3 input — confirm.
%111 = spv.IAdd %37, %74 : i32
%112 = spv.IAdd %38, %89 : i32
%113 = spv.IAdd %26, %111 : i32
%114 = spv.IAdd %27, %112 : i32
%115 = spv.IMul %113, %11 : i32
%116 = spv.IMul %114, %5 : i32
%117 = spv.IAdd %115, %116 : i32
%118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : f32
%120 = spv.IAdd %117, %4 : i32
%121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%122 = spv.Load "StorageBuffer" %121 : f32
%123 = spv.IAdd %117, %2 : i32
%124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
// Accumulator 0: each of the three scalars is replicated to a vec4, multiplied by
// %102 / %106 / %110 respectively, and added onto %90, giving %138.
%126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
%127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
%128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
%129 = spv.FMul %128, %102 : vector<4xf32>
%130 = spv.FAdd %129, %90 : vector<4xf32>
%131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.FMul %132, %106 : vector<4xf32>
%134 = spv.FAdd %133, %130 : vector<4xf32>
%135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
%136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
%137 = spv.FMul %136, %110 : vector<4xf32>
%138 = spv.FAdd %137, %134 : vector<4xf32>
// Accumulator 1: same computation with %74 + 2 in place of %74 in the binding 0
// index (the %116 term is reused); accumulates onto %91, giving %164.
%139 = spv.IAdd %74, %2 : i32
%140 = spv.IAdd %37, %139 : i32
%141 = spv.IAdd %26, %140 : i32
%142 = spv.IMul %141, %11 : i32
%143 = spv.IAdd %142, %116 : i32
%144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%145 = spv.Load "StorageBuffer" %144 : f32
%146 = spv.IAdd %143, %4 : i32
%147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%148 = spv.Load "StorageBuffer" %147 : f32
%149 = spv.IAdd %143, %2 : i32
%150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%151 = spv.Load "StorageBuffer" %150 : f32
%152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
%153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
%154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
%155 = spv.FMul %154, %102 : vector<4xf32>
%156 = spv.FAdd %155, %91 : vector<4xf32>
%157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
%158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
%159 = spv.FMul %158, %106 : vector<4xf32>
%160 = spv.FAdd %159, %156 : vector<4xf32>
%161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
%162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
%163 = spv.FMul %162, %110 : vector<4xf32>
%164 = spv.FAdd %163, %160 : vector<4xf32>
// Accumulator 2: same computation with %74 + 4; accumulates onto %92, giving %190.
%165 = spv.IAdd %74, %1 : i32
%166 = spv.IAdd %37, %165 : i32
%167 = spv.IAdd %26, %166 : i32
%168 = spv.IMul %167, %11 : i32
%169 = spv.IAdd %168, %116 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IAdd %169, %4 : i32
%173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%174 = spv.Load "StorageBuffer" %173 : f32
%175 = spv.IAdd %169, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %102 : vector<4xf32>
%182 = spv.FAdd %181, %92 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %106 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %110 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
// Accumulator 3: same computation with %74 + 6; accumulates onto %93, giving %216.
%191 = spv.IAdd %74, %3 : i32
%192 = spv.IAdd %37, %191 : i32
%193 = spv.IAdd %26, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IAdd %194, %116 : i32
%196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%197 = spv.Load "StorageBuffer" %196 : f32
%198 = spv.IAdd %195, %4 : i32
%199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%200 = spv.Load "StorageBuffer" %199 : f32
%201 = spv.IAdd %195, %2 : i32
%202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
%205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
%206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
%207 = spv.FMul %206, %102 : vector<4xf32>
%208 = spv.FAdd %207, %93 : vector<4xf32>
%209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
%210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
%211 = spv.FMul %210, %106 : vector<4xf32>
%212 = spv.FAdd %211, %208 : vector<4xf32>
%213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.FMul %214, %110 : vector<4xf32>
%216 = spv.FAdd %215, %212 : vector<4xf32>
// Write the updated accumulators to the inner-loop slots (they are read after
// the loop merges), then continue with %89 + 1.
spv.Store "Function" %80, %138 : vector<4xf32>
spv.Store "Function" %81, %164 : vector<4xf32>
spv.Store "Function" %82, %190 : vector<4xf32>
spv.Store "Function" %83, %216 : vector<4xf32>
%217 = spv.IAdd %89, %4 : i32
spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Reload the inner-loop results, copy them to the outer-loop slots, and
// continue the outer loop with %74 + 1.
%84 = spv.Load "Function" %83 : vector<4xf32>
%85 = spv.Load "Function" %82 : vector<4xf32>
%86 = spv.Load "Function" %81 : vector<4xf32>
%87 = spv.Load "Function" %80 : vector<4xf32>
spv.Store "Function" %39, %87 : vector<4xf32>
spv.Store "Function" %40, %86 : vector<4xf32>
spv.Store "Function" %41, %85 : vector<4xf32>
spv.Store "Function" %42, %84 : vector<4xf32>
%88 = spv.IAdd %74, %4 : i32
spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulators: %46 = accumulator 0, %45 = 1, %44 = 2, %43 = 3.
%43 = spv.Load "Function" %42 : vector<4xf32>
%44 = spv.Load "Function" %41 : vector<4xf32>
%45 = spv.Load "Function" %40 : vector<4xf32>
%46 = spv.Load "Function" %39 : vector<4xf32>
// Binding 2 stores. The vec4 index is r*896 + (wg.y*4 + li.y)*8 + (wg.x*16 + li.x*4)/4,
// where r = wg.z*4 + li.z*4 + k and accumulator k is stored for k = 3, 2, 1, 0.
// NOTE(review): 896 = 112*8 and 8 = 32/4, which is consistent with a
// 1x112x112x32 f32 output viewed as vec4 — confirm.
%47 = spv.IAdd %35, %5 : i32
%48 = spv.IAdd %23, %47 : i32
%49 = spv.IAdd %24, %32 : i32
%50 = spv.IAdd %25, %36 : i32
%51 = spv.SDiv %50, %1 : i32
%52 = spv.IMul %48, %12 : i32
%53 = spv.IMul %49, %8 : i32
%54 = spv.IAdd %52, %53 : i32
%55 = spv.IAdd %54, %51 : i32
%56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
%57 = spv.IAdd %35, %2 : i32
%58 = spv.IAdd %23, %57 : i32
%59 = spv.IMul %58, %12 : i32
%60 = spv.IAdd %59, %53 : i32
%61 = spv.IAdd %60, %51 : i32
%62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
%63 = spv.IAdd %35, %4 : i32
%64 = spv.IAdd %23, %63 : i32
%65 = spv.IMul %64, %12 : i32
%66 = spv.IAdd %65, %53 : i32
%67 = spv.IAdd %66, %51 : i32
%68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
%69 = spv.IAdd %23, %35 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IAdd %70, %53 : i32
%72 = spv.IAdd %71, %51 : i32
%73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
spv.Return
}
// Declares @conv_dispatch_0 as a GLCompute entry point that uses the two built-in
// input variables, with a local workgroup size of 4 x 4 x 1 invocations.
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// Private copy of the binding interface nested in the target module: three
// storage buffers in set 0 (two read-only, one write/discard).
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
// Buffer-level implementation, marked noinline and carrying no export attribute
// here; it is called by @conv$async in this module. It allocates the result
// buffer, records a one-shot command buffer that binds the two inputs and the
// result, dispatches @conv_dispatch_0 for "vulkan*" devices, submits, waits, and
// returns the result buffer.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
// Binding lengths used below: 3456 = 3*3*3*32*4, 607500 = 1*225*225*3*4,
// 1605632 = 1*112*112*32*4. NOTE(review): the factor 4 is presumably the f32 byte width — confirm.
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%c1605632 = constant 1605632 : index
// %c0, %c1 and %c2 serve as binding ordinals; %c0 is also the set index and the buffer offset.
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// One descriptor set with three storage-buffer bindings: 0 and 1 are R, 2 is DW.
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
// Bind 0 -> %arg0, 1 -> %arg1, 2 -> %buffer, each starting at offset 0.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
%c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
])
// Only a "vulkan*" case is present. The region captures %arg3, %arg4 and %arg5
// but its body reads only %arg2 (the command buffer).
hal.device.switch<%device : !hal.device>
#hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32 : index, %arg4 = %c112 : index, %arg5 = %c112 : index) {
// Workgroup counts passed to the dispatch: [2, 28, 28].
%c2_0 = constant 2 : index
%c28 = constant 28 : index
hal.command_buffer.dispatch.symbol<%arg2 : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv::@conv_dispatch_0) workgroups([%c2_0, %c28, %c28])
hal.return
}
// Barrier recorded after the dispatch, then the command buffer is closed and executed synchronously.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
}
// Exported as "conv$async". Waits on semaphore %arg0 until it reaches %arg1,
// unwraps the two input buffer views, calls @conv, wraps the returned buffer in
// a buffer view of shape [1, 112, 112, 32], signals semaphore %arg4 with value
// %arg5, and returns the view.
func @conv$async(%arg0: !hal.semaphore, %arg1: index, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.semaphore, %arg5: index) -> !hal.buffer_view attributes {iree.module.export = "conv$async"} {
// Block until the wait semaphore reaches the requested value; fail on a non-success status.
%0 = hal.semaphore.await<%arg0 : !hal.semaphore> until(%arg1) : i32
hal.check_success %0, "semaphore wait failed"
%buffer = hal.buffer_view.buffer %arg2 : !hal.buffer
%buffer_0 = hal.buffer_view.buffer %arg3 : !hal.buffer
%1 = call @conv(%buffer, %buffer_0) : (!hal.buffer, !hal.buffer) -> !hal.buffer
// Shape and element-type code for the result view.
// NOTE(review): 50331680 presumably denotes f32 — confirm.
%c1 = constant 1 : index
%c112 = constant 112 : index
%c112_1 = constant 112 : index
%c32 = constant 32 : index
%c50331680_i32 = constant 50331680 : i32
%view = hal.buffer_view.create %1, element_type = %c50331680_i32, shape = [%c1, %c112, %c112_1, %c32] : !hal.buffer -> !hal.buffer_view
// The signal is issued after @conv has returned.
hal.semaphore.signal<%arg4 : !hal.semaphore> value(%arg5)
return %view : !hal.buffer_view
}
// Synchronous wrapper exported under the name "conv". Creates a semaphore with
// initial value 0, calls @conv$async using that one semaphore for both the wait
// (value 0) and the signal (value 1), then waits for it to reach 1 before
// returning the result view.
func @conv$sync(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export = "conv", iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%device = hal.ex.shared_device : !hal.device
%semaphore = hal.semaphore.create device(%device : !hal.device) initial(%c0) : !hal.semaphore
// The wait value equals the semaphore's initial value (%c0).
%0 = call @conv$async(%semaphore, %c0, %arg0, %arg1, %semaphore, %c1) : (!hal.semaphore, index, !hal.buffer_view, !hal.buffer_view, !hal.semaphore, index) -> !hal.buffer_view
// Wait for the value that @conv$async signals (%c1); fail on a non-success status.
%1 = hal.semaphore.await<%semaphore : !hal.semaphore> until(%c1) : i32
hal.check_success %1, "semaphore wait failed"
return %0 : !hal.buffer_view
}
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::ResolveEntryPointOrdinalsPass ***
module {
hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
hal.interface @legacy_io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.target @vulkan_spirv, filter="vulkan*" {
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c2 = constant 2 : index
%c28 = constant 28 : index
hal.return %c2, %c28, %c28 : index, index, index
}
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
spv.func @conv_dispatch_0() "None" {
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.Load "Input" %16 : vector<3xi32>
%20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
%21 = spv.Load "Input" %16 : vector<3xi32>
%22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
%23 = spv.IMul %22, %1 : i32
%24 = spv.IMul %20, %1 : i32
%25 = spv.IMul %18, %7 : i32
%26 = spv.IMul %22, %8 : i32
%27 = spv.IMul %20, %8 : i32
%28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%29 = spv.Load "Input" %28 : vector<3xi32>
%30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
%31 = spv.Load "Input" %28 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.Load "Input" %28 : vector<3xi32>
%34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
%35 = spv.IMul %34, %1 : i32
%36 = spv.IMul %30, %1 : i32
%37 = spv.IMul %34, %8 : i32
%38 = spv.IMul %32, %2 : i32
%39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%79 = spv.SLessThan %74, %5 : i32
spv.BranchConditional %79, ^bb2, ^bb3
^bb2: // pred: ^bb1
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
spv.mlir.loop {
spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%94 = spv.SLessThan %89, %5 : i32
spv.BranchConditional %94, ^bb2, ^bb3
^bb2: // pred: ^bb1
%95 = spv.IAdd %25, %36 : i32
%96 = spv.SDiv %95, %1 : i32
%97 = spv.IMul %74, %9 : i32
%98 = spv.IMul %89, %10 : i32
%99 = spv.IAdd %97, %98 : i32
%100 = spv.IAdd %99, %96 : i32
%101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
%103 = spv.IAdd %99, %8 : i32
%104 = spv.IAdd %103, %96 : i32
%105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
%107 = spv.IAdd %99, %7 : i32
%108 = spv.IAdd %107, %96 : i32
%109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
%111 = spv.IAdd %37, %74 : i32
%112 = spv.IAdd %38, %89 : i32
%113 = spv.IAdd %26, %111 : i32
%114 = spv.IAdd %27, %112 : i32
%115 = spv.IMul %113, %11 : i32
%116 = spv.IMul %114, %5 : i32
%117 = spv.IAdd %115, %116 : i32
%118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : f32
%120 = spv.IAdd %117, %4 : i32
%121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%122 = spv.Load "StorageBuffer" %121 : f32
%123 = spv.IAdd %117, %2 : i32
%124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
%126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
%127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
%128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
%129 = spv.FMul %128, %102 : vector<4xf32>
%130 = spv.FAdd %129, %90 : vector<4xf32>
%131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.FMul %132, %106 : vector<4xf32>
%134 = spv.FAdd %133, %130 : vector<4xf32>
%135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
%136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
%137 = spv.FMul %136, %110 : vector<4xf32>
%138 = spv.FAdd %137, %134 : vector<4xf32>
%139 = spv.IAdd %74, %2 : i32
%140 = spv.IAdd %37, %139 : i32
%141 = spv.IAdd %26, %140 : i32
%142 = spv.IMul %141, %11 : i32
%143 = spv.IAdd %142, %116 : i32
%144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%145 = spv.Load "StorageBuffer" %144 : f32
%146 = spv.IAdd %143, %4 : i32
%147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%148 = spv.Load "StorageBuffer" %147 : f32
%149 = spv.IAdd %143, %2 : i32
%150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%151 = spv.Load "StorageBuffer" %150 : f32
%152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
%153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
%154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
%155 = spv.FMul %154, %102 : vector<4xf32>
%156 = spv.FAdd %155, %91 : vector<4xf32>
%157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
%158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
%159 = spv.FMul %158, %106 : vector<4xf32>
%160 = spv.FAdd %159, %156 : vector<4xf32>
%161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
%162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
%163 = spv.FMul %162, %110 : vector<4xf32>
%164 = spv.FAdd %163, %160 : vector<4xf32>
%165 = spv.IAdd %74, %1 : i32
%166 = spv.IAdd %37, %165 : i32
%167 = spv.IAdd %26, %166 : i32
%168 = spv.IMul %167, %11 : i32
%169 = spv.IAdd %168, %116 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IAdd %169, %4 : i32
%173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%174 = spv.Load "StorageBuffer" %173 : f32
%175 = spv.IAdd %169, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %102 : vector<4xf32>
%182 = spv.FAdd %181, %92 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %106 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %110 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
%191 = spv.IAdd %74, %3 : i32
%192 = spv.IAdd %37, %191 : i32
%193 = spv.IAdd %26, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IAdd %194, %116 : i32
%196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%197 = spv.Load "StorageBuffer" %196 : f32
%198 = spv.IAdd %195, %4 : i32
%199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%200 = spv.Load "StorageBuffer" %199 : f32
%201 = spv.IAdd %195, %2 : i32
%202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
%205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
%206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
%207 = spv.FMul %206, %102 : vector<4xf32>
%208 = spv.FAdd %207, %93 : vector<4xf32>
%209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
%210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
%211 = spv.FMul %210, %106 : vector<4xf32>
%212 = spv.FAdd %211, %208 : vector<4xf32>
%213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.FMul %214, %110 : vector<4xf32>
%216 = spv.FAdd %215, %212 : vector<4xf32>
spv.Store "Function" %80, %138 : vector<4xf32>
spv.Store "Function" %81, %164 : vector<4xf32>
spv.Store "Function" %82, %190 : vector<4xf32>
spv.Store "Function" %83, %216 : vector<4xf32>
%217 = spv.IAdd %89, %4 : i32
spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%84 = spv.Load "Function" %83 : vector<4xf32>
%85 = spv.Load "Function" %82 : vector<4xf32>
%86 = spv.Load "Function" %81 : vector<4xf32>
%87 = spv.Load "Function" %80 : vector<4xf32>
spv.Store "Function" %39, %87 : vector<4xf32>
spv.Store "Function" %40, %86 : vector<4xf32>
spv.Store "Function" %41, %85 : vector<4xf32>
spv.Store "Function" %42, %84 : vector<4xf32>
%88 = spv.IAdd %74, %4 : i32
spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
%43 = spv.Load "Function" %42 : vector<4xf32>
%44 = spv.Load "Function" %41 : vector<4xf32>
%45 = spv.Load "Function" %40 : vector<4xf32>
%46 = spv.Load "Function" %39 : vector<4xf32>
%47 = spv.IAdd %35, %5 : i32
%48 = spv.IAdd %23, %47 : i32
%49 = spv.IAdd %24, %32 : i32
%50 = spv.IAdd %25, %36 : i32
%51 = spv.SDiv %50, %1 : i32
%52 = spv.IMul %48, %12 : i32
%53 = spv.IMul %49, %8 : i32
%54 = spv.IAdd %52, %53 : i32
%55 = spv.IAdd %54, %51 : i32
%56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
%57 = spv.IAdd %35, %2 : i32
%58 = spv.IAdd %23, %57 : i32
%59 = spv.IMul %58, %12 : i32
%60 = spv.IAdd %59, %53 : i32
%61 = spv.IAdd %60, %51 : i32
%62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
%63 = spv.IAdd %35, %4 : i32
%64 = spv.IAdd %23, %63 : i32
%65 = spv.IMul %64, %12 : i32
%66 = spv.IAdd %65, %53 : i32
%67 = spv.IAdd %66, %51 : i32
%68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
%69 = spv.IAdd %23, %35 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IAdd %70, %53 : i32
%72 = spv.IAdd %71, %51 : i32
%73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
// Host-side routine that runs the convolution dispatch: allocates the result
// buffer, records a one-shot command buffer that binds three buffers and
// dispatches @conv_dispatch_0, submits it, and returns the result buffer.
//   %arg0, %arg1: buffers pushed at bindings 0 and 1 (declared "R" in the layout).
//   returns:      the freshly allocated buffer pushed at binding 2 ("DW").
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
// Byte lengths of the bound ranges: 3456 = 3*3*3*32*4, 607500 = 225*225*3*4,
// 1605632 = 112*112*32*4 (4 bytes per f32 element).
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%c1605632 = constant 1605632 : index
// %c0/%c1/%c2 serve as binding ordinals (and %c0 as offset / set index) below.
%c1 = constant 1 : index
// %c112 and %c32 are only forwarded into the device-switch region as captures.
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Result buffer, sized to hold the whole output.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// At this stage the layout is still looked up inline from its binding description.
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
// Push set 0: each entry is ordinal = (buffer)[offset, length].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
%c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
%c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
])
// Only a "vulkan*" case is present; %arg3..%arg5 are captured but not used
// inside the region, which dispatches a fixed [2, 28, 28] workgroup grid.
hal.device.switch<%device : !hal.device>
#hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32 : index, %arg4 = %c112 : index, %arg5 = %c112 : index) {
%c2_0 = constant 2 : index
%c28 = constant 28 : index
%0 = hal.command_buffer.device<%arg2 : !hal.command_buffer> : !hal.device
%exe = hal.executable.lookup device(%0 : !hal.device) executable(@conv_dispatch_0) : !hal.executable
hal.command_buffer.dispatch<%arg2 : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c2_0, %c28, %c28])
hal.return
}
// Barrier recorded after the dispatch, then the command buffer is closed.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
// NOTE(review): op name indicates submission blocks until completion - confirm.
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
}
// Exported wrapper ("conv$async") around @conv that works on buffer views and
// brackets the call with a semaphore wait and a semaphore signal.
//   %arg0/%arg1: semaphore and value to await before running.
//   %arg2/%arg3: input buffer views; their backing buffers are passed to @conv.
//   %arg4/%arg5: semaphore and value to signal after @conv returns.
//   returns:     a buffer view over @conv's result with shape [1, 112, 112, 32].
func @conv$async(%arg0: !hal.semaphore, %arg1: index, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.semaphore, %arg5: index) -> !hal.buffer_view attributes {iree.module.export = "conv$async"} {
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
// Element-type code for the result view (0x03000020); presumably encodes f32 - TODO confirm.
%c50331680_i32 = constant 50331680 : i32
// Wait for the caller-provided semaphore and fail with a message if the wait did not succeed.
%0 = hal.semaphore.await<%arg0 : !hal.semaphore> until(%arg1) : i32
hal.check_success %0, "semaphore wait failed"
%buffer = hal.buffer_view.buffer %arg2 : !hal.buffer
%buffer_0 = hal.buffer_view.buffer %arg3 : !hal.buffer
%1 = call @conv(%buffer, %buffer_0) : (!hal.buffer, !hal.buffer) -> !hal.buffer
%view = hal.buffer_view.create %1, element_type = %c50331680_i32, shape = [%c1, %c112, %c112, %c32] : !hal.buffer -> !hal.buffer_view
// Signal completion to the caller.
hal.semaphore.signal<%arg4 : !hal.semaphore> value(%arg5)
return %view : !hal.buffer_view
}
// Synchronous ABI stub exported as "conv": creates a private semaphore, calls
// @conv$async with it as both the wait and the signal semaphore, then waits
// for the signalled value before returning the result view.
func @conv$sync(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export = "conv", iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%device = hal.ex.shared_device : !hal.device
// Semaphore starts at 0, so the async wait on value %c0 targets the initial value.
%semaphore = hal.semaphore.create device(%device : !hal.device) initial(%c0) : !hal.semaphore
// Wait on (semaphore, 0); signal (semaphore, 1) when done.
%0 = call @conv$async(%semaphore, %c0, %arg0, %arg1, %semaphore, %c1) : (!hal.semaphore, index, !hal.buffer_view, !hal.buffer_view, !hal.semaphore, index) -> !hal.buffer_view
// Wait for the value that @conv$async signals.
%1 = hal.semaphore.await<%semaphore : !hal.semaphore> until(%c1) : i32
hal.check_success %1, "semaphore wait failed"
return %0 : !hal.buffer_view
}
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass ***
module {
// Private module-level variable holding the descriptor set layout; its value
// comes from the initializer function declared right below.
hal.variable @_descriptor_set_layout_0 init(@_descriptor_set_layout_0_initializer) : !hal.descriptor_set_layout attributes {sym_visibility = "private"}
// Builds a push-only descriptor set layout with three StorageBuffer bindings:
// ordinals 0 and 1 are marked R, ordinal 2 is marked DW.
func private @_descriptor_set_layout_0_initializer() -> !hal.descriptor_set_layout {
%device = hal.ex.shared_device : !hal.device
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
return %descriptor_set_layout : !hal.descriptor_set_layout
}
// Private module-level variable holding the executable layout; initialized by
// the function declared right below.
hal.variable @_executable_layout_0 init(@_executable_layout_0_initializer) : !hal.executable_layout attributes {sym_visibility = "private"}
// Builds the executable layout from the single descriptor set layout stored
// in @_descriptor_set_layout_0, with zero push constants.
func private @_executable_layout_0_initializer() -> !hal.executable_layout {
// Reads the other variable, so that variable must already hold a value here.
%0 = hal.variable.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%0]) : !hal.executable_layout
return %executable_layout : !hal.executable_layout
}
// Private module-level variable holding the executable for @conv_dispatch_0;
// initialized by the function declared right below.
hal.variable @_executable_conv_dispatch_0 init(@_executable_conv_dispatch_0_initializer) : !hal.executable attributes {sym_visibility = "private"}
// Creates the executable for the shared device, selecting the target by
// device id pattern. Returns a null executable when no pattern matches.
func private @_executable_conv_dispatch_0_initializer() -> !hal.executable {
%device = hal.ex.shared_device : !hal.device
%0 = hal.device.switch<%device : !hal.device> -> !hal.executable
// Case 1: device id matches "vulkan*" - create from the @vulkan_spirv target
// using the layout stored in @_executable_layout_0.
#hal.device.match.id<"vulkan*">(%arg0 = %device : !hal.device) {
%1 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%arg0 : !hal.device) target(@conv_dispatch_0::@vulkan_spirv) layouts([%1]) : !hal.executable
hal.return %exe : !hal.executable
},
// Fallback case: always matches and yields a null executable.
#hal.match.always() {
%1 = iree.null : !hal.executable
hal.return %1 : !hal.executable
}
return %0 : !hal.executable
}
hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
// Binding interface of the executable: three StorageBuffer bindings in set 0.
// @ro0 and @ro1 are read-only; @wo2 is declared "Write|Discard".
hal.interface @legacy_io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.target @vulkan_spirv, filter="vulkan*" {
// Entry point ordinal 0 of the target. The attached region returns the
// workgroup count as three constants (2, 28, 28); its three index block
// arguments are not used by the region body.
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c2 = constant 2 : index
%c28 = constant 28 : index
hal.return %c2, %c28, %c28 : index, index, index
}
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
// Built-in inputs read by the kernel: local invocation id and workgroup id.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Storage buffers, all in descriptor set 0:
//   binding 2: 100352 x vector<4xf32> (= 112*112*32/4), stored to by the kernel.
//   binding 1: 216 x vector<4xf32>    (= 3*3*3*32/4), loaded as vec4.
//   binding 0: 151875 x f32           (= 225*225*3), loaded as scalars.
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
// Compute kernel for the convolution dispatch.
//
// Each invocation keeps four vector<4xf32> accumulators (zero-initialised),
// runs a 3 x 3 nested loop (both counters go 0, 1, 2), and on every inner
// iteration:
//   * loads three vec4 values from the binding-1 buffer (offsets +0, +8, +16),
//   * for four row offsets (+0, +2, +4, +6 relative to the outer counter) loads
//     three consecutive f32 values from the binding-0 buffer,
//   * splats each scalar to a vec4 and multiply-accumulates it with the
//     matching binding-1 vec4.
// After the loops the four accumulators are stored to the binding-2 buffer at
// four row offsets (+0, +1, +2, +3).
//
// NOTE(review): the strides used (72/24/8 for binding 1, 675/3 for binding 0,
// 896/8 for binding 2) are consistent with a 3x3x3x32 filter, a 225x225x3
// input and a 112x112x32 output addressed in vec4 units - confirm against the
// tensor layouts of the originating convolution.
spv.func @conv_dispatch_0() "None" {
// Zero vector used as the initial accumulator value.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
// Integer literals shared by the index arithmetic below.
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
// Pointers to the storage buffers: %13 = binding 0 (f32 scalars),
// %14 = binding 1 (vec4), %15 = binding 2 (vec4, written at the end).
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// Workgroup id components: %18 = x, %20 = y, %22 = z (the built-in is loaded
// once per component).
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.Load "Input" %16 : vector<3xi32>
%20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
%21 = spv.Load "Input" %16 : vector<3xi32>
%22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
// Per-workgroup base offsets: %23 = z*4 and %24 = y*4 feed the binding-2
// (store) index; %25 = x*16 feeds both the binding-1 and binding-2 index;
// %26 = z*8 and %27 = y*8 feed the binding-0 (load) index.
%23 = spv.IMul %22, %1 : i32
%24 = spv.IMul %20, %1 : i32
%25 = spv.IMul %18, %7 : i32
%26 = spv.IMul %22, %8 : i32
%27 = spv.IMul %20, %8 : i32
// Local invocation id components: %30 = x, %32 = y, %34 = z.
%28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%29 = spv.Load "Input" %28 : vector<3xi32>
%30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
%31 = spv.Load "Input" %28 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.Load "Input" %28 : vector<3xi32>
%34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
// Per-invocation offsets: %35 = lz*4, %36 = lx*4, %37 = lz*8, %38 = ly*2.
%35 = spv.IMul %34, %1 : i32
%36 = spv.IMul %30, %1 : i32
%37 = spv.IMul %34, %8 : i32
%38 = spv.IMul %32, %2 : i32
// Function-scope slots that carry the four accumulators out of the outer loop.
%39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: counter %74 starts at 0 and runs while %74 < 3; the four
// accumulators %75..%78 start at zero and are threaded as block arguments.
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%79 = spv.SLessThan %74, %5 : i32
spv.BranchConditional %79, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Slots that carry the accumulators out of the inner loop.
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: counter %89 starts at 0 and runs while %89 < 3; accumulators
// %90..%93 are seeded from the outer loop's current values.
spv.mlir.loop {
spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%94 = spv.SLessThan %89, %5 : i32
spv.BranchConditional %94, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Binding-1 loads. %96 = (x*16 + lx*4) / 4 does not depend on either loop
// counter but is recomputed every iteration. Base index %99 = %74*72 + %89*24;
// the three vec4 loads use %99 + %96 plus 0, 8 and 16.
%95 = spv.IAdd %25, %36 : i32
%96 = spv.SDiv %95, %1 : i32
%97 = spv.IMul %74, %9 : i32
%98 = spv.IMul %89, %10 : i32
%99 = spv.IAdd %97, %98 : i32
%100 = spv.IAdd %99, %96 : i32
%101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
%103 = spv.IAdd %99, %8 : i32
%104 = spv.IAdd %103, %96 : i32
%105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
%107 = spv.IAdd %99, %7 : i32
%108 = spv.IAdd %107, %96 : i32
%109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
// Binding-0 loads for row offset +0. Row term %115 = (z*8 + lz*8 + %74) * 675,
// column term %116 = (y*8 + ly*2 + %89) * 3; three consecutive scalars are
// read at %117, %117 + 1 and %117 + 2.
%111 = spv.IAdd %37, %74 : i32
%112 = spv.IAdd %38, %89 : i32
%113 = spv.IAdd %26, %111 : i32
%114 = spv.IAdd %27, %112 : i32
%115 = spv.IMul %113, %11 : i32
%116 = spv.IMul %114, %5 : i32
%117 = spv.IAdd %115, %116 : i32
%118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : f32
%120 = spv.IAdd %117, %4 : i32
%121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%122 = spv.Load "StorageBuffer" %121 : f32
%123 = spv.IAdd %117, %2 : i32
%124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
// The three scalars are packed into a vector<3xf32> and immediately extracted
// again; each one is splatted to vec4 and multiply-added into accumulator 0
// (%90 -> %138) against %102, %106 and %110 respectively.
%126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
%127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
%128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
%129 = spv.FMul %128, %102 : vector<4xf32>
%130 = spv.FAdd %129, %90 : vector<4xf32>
%131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.FMul %132, %106 : vector<4xf32>
%134 = spv.FAdd %133, %130 : vector<4xf32>
%135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
%136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
%137 = spv.FMul %136, %110 : vector<4xf32>
%138 = spv.FAdd %137, %134 : vector<4xf32>
// Same computation for row offset +2 (%74 + 2), reusing column term %116;
// result goes into accumulator 1 (%91 -> %164).
%139 = spv.IAdd %74, %2 : i32
%140 = spv.IAdd %37, %139 : i32
%141 = spv.IAdd %26, %140 : i32
%142 = spv.IMul %141, %11 : i32
%143 = spv.IAdd %142, %116 : i32
%144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%145 = spv.Load "StorageBuffer" %144 : f32
%146 = spv.IAdd %143, %4 : i32
%147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%148 = spv.Load "StorageBuffer" %147 : f32
%149 = spv.IAdd %143, %2 : i32
%150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%151 = spv.Load "StorageBuffer" %150 : f32
%152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
%153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
%154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
%155 = spv.FMul %154, %102 : vector<4xf32>
%156 = spv.FAdd %155, %91 : vector<4xf32>
%157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
%158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
%159 = spv.FMul %158, %106 : vector<4xf32>
%160 = spv.FAdd %159, %156 : vector<4xf32>
%161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
%162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
%163 = spv.FMul %162, %110 : vector<4xf32>
%164 = spv.FAdd %163, %160 : vector<4xf32>
// Row offset +4 (%74 + 4); result goes into accumulator 2 (%92 -> %190).
%165 = spv.IAdd %74, %1 : i32
%166 = spv.IAdd %37, %165 : i32
%167 = spv.IAdd %26, %166 : i32
%168 = spv.IMul %167, %11 : i32
%169 = spv.IAdd %168, %116 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IAdd %169, %4 : i32
%173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%174 = spv.Load "StorageBuffer" %173 : f32
%175 = spv.IAdd %169, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %102 : vector<4xf32>
%182 = spv.FAdd %181, %92 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %106 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %110 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
// Row offset +6 (%74 + 6); result goes into accumulator 3 (%93 -> %216).
%191 = spv.IAdd %74, %3 : i32
%192 = spv.IAdd %37, %191 : i32
%193 = spv.IAdd %26, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IAdd %194, %116 : i32
%196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%197 = spv.Load "StorageBuffer" %196 : f32
%198 = spv.IAdd %195, %4 : i32
%199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%200 = spv.Load "StorageBuffer" %199 : f32
%201 = spv.IAdd %195, %2 : i32
%202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
%205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
%206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
%207 = spv.FMul %206, %102 : vector<4xf32>
%208 = spv.FAdd %207, %93 : vector<4xf32>
%209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
%210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
%211 = spv.FMul %210, %106 : vector<4xf32>
%212 = spv.FAdd %211, %208 : vector<4xf32>
%213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.FMul %214, %110 : vector<4xf32>
%216 = spv.FAdd %215, %212 : vector<4xf32>
// Spill the updated accumulators to the inner slots on every iteration so the
// values are readable after the loop, then advance %89 by 1 and loop back.
spv.Store "Function" %80, %138 : vector<4xf32>
spv.Store "Function" %81, %164 : vector<4xf32>
spv.Store "Function" %82, %190 : vector<4xf32>
spv.Store "Function" %83, %216 : vector<4xf32>
%217 = spv.IAdd %89, %4 : i32
spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Reload the inner-loop results (note the reversed order: %84 is accumulator 3,
// %87 is accumulator 0), copy them to the outer slots, advance %74 by 1.
%84 = spv.Load "Function" %83 : vector<4xf32>
%85 = spv.Load "Function" %82 : vector<4xf32>
%86 = spv.Load "Function" %81 : vector<4xf32>
%87 = spv.Load "Function" %80 : vector<4xf32>
spv.Store "Function" %39, %87 : vector<4xf32>
spv.Store "Function" %40, %86 : vector<4xf32>
spv.Store "Function" %41, %85 : vector<4xf32>
spv.Store "Function" %42, %84 : vector<4xf32>
%88 = spv.IAdd %74, %4 : i32
spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulators: %46 = accumulator 0, %45 = 1, %44 = 2, %43 = 3.
%43 = spv.Load "Function" %42 : vector<4xf32>
%44 = spv.Load "Function" %41 : vector<4xf32>
%45 = spv.Load "Function" %40 : vector<4xf32>
%46 = spv.Load "Function" %39 : vector<4xf32>
// Binding-2 stores. Index = row*896 + (y*4 + ly)*8 + (x*16 + lx*4)/4, where
// row = z*4 + lz*4 + k. Accumulator 3 is written first at k = 3.
%47 = spv.IAdd %35, %5 : i32
%48 = spv.IAdd %23, %47 : i32
%49 = spv.IAdd %24, %32 : i32
%50 = spv.IAdd %25, %36 : i32
%51 = spv.SDiv %50, %1 : i32
%52 = spv.IMul %48, %12 : i32
%53 = spv.IMul %49, %8 : i32
%54 = spv.IAdd %52, %53 : i32
%55 = spv.IAdd %54, %51 : i32
%56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
// Accumulator 2 at k = 2.
%57 = spv.IAdd %35, %2 : i32
%58 = spv.IAdd %23, %57 : i32
%59 = spv.IMul %58, %12 : i32
%60 = spv.IAdd %59, %53 : i32
%61 = spv.IAdd %60, %51 : i32
%62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
// Accumulator 1 at k = 1.
%63 = spv.IAdd %35, %4 : i32
%64 = spv.IAdd %23, %63 : i32
%65 = spv.IMul %64, %12 : i32
%66 = spv.IAdd %65, %53 : i32
%67 = spv.IAdd %66, %51 : i32
%68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
// Accumulator 0 at k = 0.
%69 = spv.IAdd %23, %35 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IAdd %70, %53 : i32
%72 = spv.IAdd %71, %51 : i32
%73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
spv.Return
}
// Declares @conv_dispatch_0 as a GLCompute entry point whose interface consists
// of the two built-in input variables, and fixes its local size to 4 x 4 x 1.
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// Private copy of the binding interface nested next to the SPIR-V module; it
// lists the same three set-0 StorageBuffer bindings as the enclosing
// executable's @legacy_io.
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
// Host-side routine that runs the convolution dispatch: allocates the result
// buffer, records a one-shot command buffer that binds three buffers and
// dispatches the cached executable, submits it, and returns the result buffer.
// In this form the executable layout and the executable are read from
// module-level variables instead of being looked up inline.
//   %arg0, %arg1: buffers pushed at bindings 0 and 1.
//   returns:      the freshly allocated buffer pushed at binding 2.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
// Byte lengths of the bound ranges: 3456 = 3*3*3*32*4, 607500 = 225*225*3*4,
// 1605632 = 112*112*32*4 (4 bytes per f32 element).
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%c1605632 = constant 1605632 : index
// %c0/%c1/%c2 serve as binding ordinals (and %c0 as offset / set index) below.
%c1 = constant 1 : index
// %c112 and %c32 are only forwarded into the device-switch region as captures.
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Result buffer, sized to hold the whole output.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
// Executable layout comes from the module-level variable.
%0 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
// Push set 0: each entry is ordinal = (buffer)[offset, length].
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
%c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
])
// Only a "vulkan*" case is present; %arg3..%arg5 are captured but not used
// inside the region, which dispatches a fixed [2, 28, 28] workgroup grid.
hal.device.switch<%device : !hal.device>
#hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32 : index, %arg4 = %c112 : index, %arg5 = %c112 : index) {
%c2_0 = constant 2 : index
%c28 = constant 28 : index
// %1 is not referenced by any later op in this region.
%1 = hal.command_buffer.device<%arg2 : !hal.command_buffer> : !hal.device
// Executable comes from the module-level variable.
%2 = hal.variable.load @_executable_conv_dispatch_0 : !hal.executable
hal.command_buffer.dispatch<%arg2 : !hal.command_buffer> target(%2 : !hal.executable)[0] workgroups([%c2_0, %c28, %c28])
hal.return
}
// Barrier recorded after the dispatch, then the command buffer is closed.
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
// NOTE(review): op name indicates submission blocks until completion - confirm.
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
}
// Exported wrapper ("conv$async") around @conv that works on buffer views and
// brackets the call with a semaphore wait and a semaphore signal.
//   %arg0/%arg1: semaphore and value to await before running.
//   %arg2/%arg3: input buffer views; their backing buffers are passed to @conv.
//   %arg4/%arg5: semaphore and value to signal after @conv returns.
//   returns:     a buffer view over @conv's result with shape [1, 112, 112, 32].
func @conv$async(%arg0: !hal.semaphore, %arg1: index, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.semaphore, %arg5: index) -> !hal.buffer_view attributes {iree.module.export = "conv$async"} {
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
// Element-type code for the result view (0x03000020); presumably encodes f32 - TODO confirm.
%c50331680_i32 = constant 50331680 : i32
// Wait for the caller-provided semaphore and fail with a message if the wait did not succeed.
%0 = hal.semaphore.await<%arg0 : !hal.semaphore> until(%arg1) : i32
hal.check_success %0, "semaphore wait failed"
%buffer = hal.buffer_view.buffer %arg2 : !hal.buffer
%buffer_0 = hal.buffer_view.buffer %arg3 : !hal.buffer
%1 = call @conv(%buffer, %buffer_0) : (!hal.buffer, !hal.buffer) -> !hal.buffer
%view = hal.buffer_view.create %1, element_type = %c50331680_i32, shape = [%c1, %c112, %c112, %c32] : !hal.buffer -> !hal.buffer_view
// Signal completion to the caller.
hal.semaphore.signal<%arg4 : !hal.semaphore> value(%arg5)
return %view : !hal.buffer_view
}
// Synchronous wrapper exported as "conv": creates a fresh semaphore, calls
// @conv$async with it as both the wait and the signal semaphore, then blocks
// until the signal value is reached.
//   %arg0, %arg1: input buffer views forwarded unchanged to @conv$async.
// Returns the buffer view produced by @conv$async.
func @conv$sync(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export = "conv", iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%device = hal.ex.shared_device : !hal.device
// The semaphore starts at 0, which is also the value the async call waits for.
%semaphore = hal.semaphore.create device(%device : !hal.device) initial(%c0) : !hal.semaphore
// Wait value is %c0, signal value is %c1, both on the same semaphore.
%0 = call @conv$async(%semaphore, %c0, %arg0, %arg1, %semaphore, %c1) : (!hal.semaphore, index, !hal.buffer_view, !hal.buffer_view, !hal.semaphore, index) -> !hal.buffer_view
// Block until the semaphore reaches the signal value 1, then check the status.
%1 = hal.semaphore.await<%semaphore : !hal.semaphore> until(%c1) : i32
hal.check_success %1, "semaphore wait failed"
return %0 : !hal.buffer_view
}
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass ***
// Initializer that produces the !hal.executable for conv_dispatch_0.
// If the shared device id matches "vulkan*" the Vulkan/SPIR-V target is
// created; otherwise a null executable is returned.
func private @_executable_conv_dispatch_0_initializer() -> !hal.executable {
%device = hal.ex.shared_device : !hal.device
%0 = hal.device.match.id<%device : !hal.device> pattern("vulkan*") : i1
cond_br %0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Matching device: create the executable from the @vulkan_spirv target using
// the executable layout held in @_executable_layout_0.
%1 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv) layouts([%1]) : !hal.executable
br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
// Fallback case: the branch condition is the constant true, so control always
// goes to ^bb3 and ^bb4 is never taken from here.
%true = constant true
cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%2 = iree.null : !hal.executable
br ^bb5(%2 : !hal.executable)
^bb4: // pred: ^bb2
iree.unreachable
^bb5(%3: !hal.executable): // 2 preds: ^bb1, ^bb3
// Join point: %3 is either the created executable or null.
return %3 : !hal.executable
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass ***
// Host-side driver for the convolution: allocates the result buffer, records
// a one-shot command buffer that binds the three buffers and dispatches
// entry point 0 of @_executable_conv_dispatch_0, then submits and waits.
//   %arg0: buffer bound at binding 0 with the range [%c0, %c607500].
//   %arg1: buffer bound at binding 1 with the range [%c0, %c3456].
// Returns the newly allocated buffer (bound at binding 2).
// In this dump the device switch has been inlined into explicit branches.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
// NOTE(review): 3456 == 3*3*3*32*4, 607500 == 225*225*3*4 and
// 1605632 == 112*112*32*4, i.e. the byte sizes of the f32 tensors in the
// original @conv signature - presumably these are byte lengths; confirm.
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%c1605632 = constant 1605632 : index
%c1 = constant 1 : index
// %c112 and %c32 are not referenced again inside this function.
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Result buffer sized by %c1605632.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%0 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
// Push descriptor set %c0: bindings 0 and 1 are the inputs, binding 2 is the result.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
%c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
])
// Inlined device switch: only a device id matching "vulkan*" has a dispatch path.
%1 = hal.device.match.id<%device : !hal.device> pattern("vulkan*") : i1
cond_br %1, ^bb1, ^bb2
^bb1: // pred: ^bb0
%c2_0 = constant 2 : index
%c28 = constant 28 : index
// %2 is not used by any later op in this function.
%2 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%3 = hal.variable.load @_executable_conv_dispatch_0 : !hal.executable
// Dispatch entry point 0 with a workgroup count of [2, 28, 28].
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%3 : !hal.executable)[0] workgroups([%c2_0, %c28, %c28])
br ^bb3
^bb2: // pred: ^bb0
// No match: this path is marked unreachable.
iree.unreachable
^bb3: // pred: ^bb1
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
// Submit the command buffer and wait for it before returning the result buffer.
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass ***
module {
// Memoized device query: @_device_match_id_0 holds the i1 result of matching
// the shared device id against "vulkan*", computed by the initializer below
// and read elsewhere via hal.variable.load.
hal.variable @_device_match_id_0 init(@_device_match_id_0_initializer) : i1 attributes {sym_visibility = "private"}
func private @_device_match_id_0_initializer() -> i1 {
%device = hal.ex.shared_device : !hal.device
%0 = hal.device.match.id<%device : !hal.device> pattern("vulkan*") : i1
return %0 : i1
}
// Variable holding the descriptor set layout used by the dispatch, plus its
// initializer. The layout declares three "StorageBuffer" bindings: 0 and 1
// with access R, and 2 with access DW.
hal.variable @_descriptor_set_layout_0 init(@_descriptor_set_layout_0_initializer) : !hal.descriptor_set_layout attributes {sym_visibility = "private"}
func private @_descriptor_set_layout_0_initializer() -> !hal.descriptor_set_layout {
%device = hal.ex.shared_device : !hal.device
// Created with usage(PushOnly) on the shared device.
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
return %descriptor_set_layout : !hal.descriptor_set_layout
}
// Variable holding the executable layout, plus its initializer. The layout is
// built from the single descriptor set layout in @_descriptor_set_layout_0
// with push_constants(0).
hal.variable @_executable_layout_0 init(@_executable_layout_0_initializer) : !hal.executable_layout attributes {sym_visibility = "private"}
func private @_executable_layout_0_initializer() -> !hal.executable_layout {
// Reads the descriptor set layout variable rather than creating a new layout.
%0 = hal.variable.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
%device = hal.ex.shared_device : !hal.device
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%0]) : !hal.executable_layout
return %executable_layout : !hal.executable_layout
}
// Variable holding the !hal.executable for conv_dispatch_0, plus its
// initializer. The device match is read from the memoized variable
// @_device_match_id_0 instead of being queried inline.
hal.variable @_executable_conv_dispatch_0 init(@_executable_conv_dispatch_0_initializer) : !hal.executable attributes {sym_visibility = "private"}
func private @_executable_conv_dispatch_0_initializer() -> !hal.executable {
%device = hal.ex.shared_device : !hal.device
%0 = hal.variable.load @_device_match_id_0 : i1
cond_br %0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Match: create the executable from the @vulkan_spirv target with the shared layout.
%1 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv) layouts([%1]) : !hal.executable
br ^bb5(%exe : !hal.executable)
^bb2: // pred: ^bb0
// No match: the condition is the constant true, so ^bb3 is always chosen over ^bb4.
%true = constant true
cond_br %true, ^bb3, ^bb4
^bb3: // pred: ^bb2
%2 = iree.null : !hal.executable
br ^bb5(%2 : !hal.executable)
^bb4: // pred: ^bb2
iree.unreachable
^bb5(%3: !hal.executable): // 2 preds: ^bb1, ^bb3
// %3 is the created executable (from ^bb1) or null (from ^bb3).
return %3 : !hal.executable
}
hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
// Binding interface of the executable: three storage buffers in set 0.
// @ro0 and @ro1 are read-only; @wo2 is declared "Write|Discard".
hal.interface @legacy_io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.target @vulkan_spirv, filter="vulkan*" {
// Entry point (ordinal 0) of the Vulkan/SPIR-V target, bound to @legacy_io.
// Its region takes three index arguments and returns the constants
// (2, 28, 28) without reading them; the host dispatch in @conv uses the same
// [2, 28, 28] workgroup count.
hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
^bb0(%arg0: index, %arg1: index, %arg2: index): // no predecessors
%c2 = constant 2 : index
%c28 = constant 28 : index
hal.return %c2, %c28, %c28 : index, index, index
}
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>} {
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
// Built-in inputs read by the kernel: local invocation id and workgroup id.
spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
// Storage buffers, all in descriptor set 0:
//   binding 2: 100352 x vector<4xf32> (100352*4 == 112*112*32 f32 elements),
//   binding 1: 216 x vector<4xf32>    (216*4 == 3*3*3*32 f32 elements),
//   binding 0: 151875 x f32           (151875 == 225*225*3), kept as scalars.
spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
// SPIR-V kernel for the convolution dispatch.
// Each invocation keeps four vector<4xf32> accumulators (initialised to zero),
// runs two nested loops with trip count 3 (%5) and, per iteration, loads three
// filter vec4s from binding 1 and four groups of three f32 scalars from
// binding 0, accumulating splat(scalar) * filter into each accumulator.
// After the loops the four accumulators are stored to binding 2 at four
// indices that differ only in the term multiplied by 896 (%12).
spv.func @conv_dispatch_0() "None" {
// Constants: zero vector for accumulator init, plus the integer factors used
// in the index arithmetic below.
%0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
%1 = spv.Constant 4 : i32
%2 = spv.Constant 2 : i32
%3 = spv.Constant 6 : i32
%4 = spv.Constant 1 : i32
%5 = spv.Constant 3 : i32
%6 = spv.Constant 0 : i32
%7 = spv.Constant 16 : i32
%8 = spv.Constant 8 : i32
%9 = spv.Constant 72 : i32
%10 = spv.Constant 24 : i32
// 675 == 225*3 and 896 == 112*8; used as the outermost linearisation factors
// for binding 0 (scalar elements) and binding 2 (vec4 elements) respectively.
%11 = spv.Constant 675 : i32
%12 = spv.Constant 896 : i32
// %13 -> binding 0 (f32 array), %14 -> binding 1 (vec4 array), %15 -> binding 2 (vec4 array).
%13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
%14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
%15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
// Workgroup id components: %18 = x, %20 = y, %22 = z (the builtin is loaded three times).
%16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
%17 = spv.Load "Input" %16 : vector<3xi32>
%18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
%19 = spv.Load "Input" %16 : vector<3xi32>
%20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
%21 = spv.Load "Input" %16 : vector<3xi32>
%22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
// Per-workgroup base offsets: %23 = wg.z*4, %24 = wg.y*4, %25 = wg.x*16,
// %26 = wg.z*8, %27 = wg.y*8.
%23 = spv.IMul %22, %1 : i32
%24 = spv.IMul %20, %1 : i32
%25 = spv.IMul %18, %7 : i32
%26 = spv.IMul %22, %8 : i32
%27 = spv.IMul %20, %8 : i32
// Local invocation id components: %30 = x, %32 = y, %34 = z.
%28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
%29 = spv.Load "Input" %28 : vector<3xi32>
%30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
%31 = spv.Load "Input" %28 : vector<3xi32>
%32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
%33 = spv.Load "Input" %28 : vector<3xi32>
%34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
// Per-invocation offsets: %35 = local.z*4, %36 = local.x*4, %37 = local.z*8, %38 = local.y*2.
%35 = spv.IMul %34, %1 : i32
%36 = spv.IMul %30, %1 : i32
%37 = spv.IMul %34, %8 : i32
%38 = spv.IMul %32, %2 : i32
// Function-scope slots that carry the four accumulators out of the outer loop.
%39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Outer loop: %74 runs from 0 while %74 < 3, stepping by 1; %75..%78 are the
// four accumulators, all starting from the zero vector %0.
spv.mlir.loop {
spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%79 = spv.SLessThan %74, %5 : i32
spv.BranchConditional %79, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Slots that carry the accumulators out of the inner loop.
%80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
%83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
// Inner loop: %89 runs from 0 while %89 < 3, stepping by 1; %90..%93 continue
// the accumulators of the current outer iteration.
spv.mlir.loop {
spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>): // 2 preds: ^bb0, ^bb2
%94 = spv.SLessThan %89, %5 : i32
spv.BranchConditional %94, ^bb2, ^bb3
^bb2: // pred: ^bb1
// Filter loads from binding 1. With base = %74*72 + %89*24 and
// col = (wg.x*16 + local.x*4) / 4, the three vec4s come from
// base + col (%102), base + 8 + col (%106) and base + 16 + col (%110).
%95 = spv.IAdd %25, %36 : i32
%96 = spv.SDiv %95, %1 : i32
%97 = spv.IMul %74, %9 : i32
%98 = spv.IMul %89, %10 : i32
%99 = spv.IAdd %97, %98 : i32
%100 = spv.IAdd %99, %96 : i32
%101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
%103 = spv.IAdd %99, %8 : i32
%104 = spv.IAdd %103, %96 : i32
%105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
%107 = spv.IAdd %99, %7 : i32
%108 = spv.IAdd %107, %96 : i32
%109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
%110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
// Accumulator 0. Scalar index into binding 0:
//   (wg.z*8 + local.z*8 + %74) * 675 + (wg.y*8 + local.y*2 + %89) * 3,
// then three consecutive f32 values at offsets +0, +1, +2.
// %116 (the second term) is reused by the three accumulators that follow.
%111 = spv.IAdd %37, %74 : i32
%112 = spv.IAdd %38, %89 : i32
%113 = spv.IAdd %26, %111 : i32
%114 = spv.IAdd %27, %112 : i32
%115 = spv.IMul %113, %11 : i32
%116 = spv.IMul %114, %5 : i32
%117 = spv.IAdd %115, %116 : i32
%118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%119 = spv.Load "StorageBuffer" %118 : f32
%120 = spv.IAdd %117, %4 : i32
%121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%122 = spv.Load "StorageBuffer" %121 : f32
%123 = spv.IAdd %117, %2 : i32
%124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%125 = spv.Load "StorageBuffer" %124 : f32
// Each of the three scalars is splatted to a vec4, multiplied with the
// matching filter vec4 and added into the running sum (%90 -> %138).
%126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
%127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
%128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
%129 = spv.FMul %128, %102 : vector<4xf32>
%130 = spv.FAdd %129, %90 : vector<4xf32>
%131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
%132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
%133 = spv.FMul %132, %106 : vector<4xf32>
%134 = spv.FAdd %133, %130 : vector<4xf32>
%135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
%136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
%137 = spv.FMul %136, %110 : vector<4xf32>
%138 = spv.FAdd %137, %134 : vector<4xf32>
// Accumulator 1: same computation with the first index term advanced by 2
// (%74 + %2); result chain %91 -> %164.
%139 = spv.IAdd %74, %2 : i32
%140 = spv.IAdd %37, %139 : i32
%141 = spv.IAdd %26, %140 : i32
%142 = spv.IMul %141, %11 : i32
%143 = spv.IAdd %142, %116 : i32
%144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%145 = spv.Load "StorageBuffer" %144 : f32
%146 = spv.IAdd %143, %4 : i32
%147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%148 = spv.Load "StorageBuffer" %147 : f32
%149 = spv.IAdd %143, %2 : i32
%150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%151 = spv.Load "StorageBuffer" %150 : f32
%152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
%153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
%154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
%155 = spv.FMul %154, %102 : vector<4xf32>
%156 = spv.FAdd %155, %91 : vector<4xf32>
%157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
%158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
%159 = spv.FMul %158, %106 : vector<4xf32>
%160 = spv.FAdd %159, %156 : vector<4xf32>
%161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
%162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
%163 = spv.FMul %162, %110 : vector<4xf32>
%164 = spv.FAdd %163, %160 : vector<4xf32>
// Accumulator 2: first index term advanced by 4 (%74 + %1); result chain %92 -> %190.
%165 = spv.IAdd %74, %1 : i32
%166 = spv.IAdd %37, %165 : i32
%167 = spv.IAdd %26, %166 : i32
%168 = spv.IMul %167, %11 : i32
%169 = spv.IAdd %168, %116 : i32
%170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%171 = spv.Load "StorageBuffer" %170 : f32
%172 = spv.IAdd %169, %4 : i32
%173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%174 = spv.Load "StorageBuffer" %173 : f32
%175 = spv.IAdd %169, %2 : i32
%176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%177 = spv.Load "StorageBuffer" %176 : f32
%178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
%179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
%180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
%181 = spv.FMul %180, %102 : vector<4xf32>
%182 = spv.FAdd %181, %92 : vector<4xf32>
%183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
%184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
%185 = spv.FMul %184, %106 : vector<4xf32>
%186 = spv.FAdd %185, %182 : vector<4xf32>
%187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
%188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
%189 = spv.FMul %188, %110 : vector<4xf32>
%190 = spv.FAdd %189, %186 : vector<4xf32>
// Accumulator 3: first index term advanced by 6 (%74 + %3); result chain %93 -> %216.
%191 = spv.IAdd %74, %3 : i32
%192 = spv.IAdd %37, %191 : i32
%193 = spv.IAdd %26, %192 : i32
%194 = spv.IMul %193, %11 : i32
%195 = spv.IAdd %194, %116 : i32
%196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%197 = spv.Load "StorageBuffer" %196 : f32
%198 = spv.IAdd %195, %4 : i32
%199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%200 = spv.Load "StorageBuffer" %199 : f32
%201 = spv.IAdd %195, %2 : i32
%202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
%203 = spv.Load "StorageBuffer" %202 : f32
%204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
%205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
%206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
%207 = spv.FMul %206, %102 : vector<4xf32>
%208 = spv.FAdd %207, %93 : vector<4xf32>
%209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
%210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
%211 = spv.FMul %210, %106 : vector<4xf32>
%212 = spv.FAdd %211, %208 : vector<4xf32>
%213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
%214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
%215 = spv.FMul %214, %110 : vector<4xf32>
%216 = spv.FAdd %215, %212 : vector<4xf32>
// Spill the updated accumulators to the inner-loop slots on every iteration so
// that the values are readable after the loop region, then advance %89 by 1.
spv.Store "Function" %80, %138 : vector<4xf32>
spv.Store "Function" %81, %164 : vector<4xf32>
spv.Store "Function" %82, %190 : vector<4xf32>
spv.Store "Function" %83, %216 : vector<4xf32>
%217 = spv.IAdd %89, %4 : i32
spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Reload the inner-loop results, copy them to the outer-loop slots
// (%80 -> %39, %81 -> %40, %82 -> %41, %83 -> %42) and advance %74 by 1.
%84 = spv.Load "Function" %83 : vector<4xf32>
%85 = spv.Load "Function" %82 : vector<4xf32>
%86 = spv.Load "Function" %81 : vector<4xf32>
%87 = spv.Load "Function" %80 : vector<4xf32>
spv.Store "Function" %39, %87 : vector<4xf32>
spv.Store "Function" %40, %86 : vector<4xf32>
spv.Store "Function" %41, %85 : vector<4xf32>
spv.Store "Function" %42, %84 : vector<4xf32>
%88 = spv.IAdd %74, %4 : i32
spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
^bb3: // pred: ^bb1
spv.mlir.merge
}
// Final accumulators: %46 = accumulator 0, %45 = 1, %44 = 2, %43 = 3.
%43 = spv.Load "Function" %42 : vector<4xf32>
%44 = spv.Load "Function" %41 : vector<4xf32>
%45 = spv.Load "Function" %40 : vector<4xf32>
%46 = spv.Load "Function" %39 : vector<4xf32>
// Output index into binding 2 (vec4 elements):
//   (wg.z*4 + local.z*4 + k) * 896 + (wg.y*4 + local.y) * 8 + (wg.x*16 + local.x*4) / 4
// Accumulator 3 is written with k = 3 first; k = 2, 1, 0 follow.
%47 = spv.IAdd %35, %5 : i32
%48 = spv.IAdd %23, %47 : i32
%49 = spv.IAdd %24, %32 : i32
%50 = spv.IAdd %25, %36 : i32
%51 = spv.SDiv %50, %1 : i32
%52 = spv.IMul %48, %12 : i32
%53 = spv.IMul %49, %8 : i32
%54 = spv.IAdd %52, %53 : i32
%55 = spv.IAdd %54, %51 : i32
%56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
// k = 2: accumulator 2.
%57 = spv.IAdd %35, %2 : i32
%58 = spv.IAdd %23, %57 : i32
%59 = spv.IMul %58, %12 : i32
%60 = spv.IAdd %59, %53 : i32
%61 = spv.IAdd %60, %51 : i32
%62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
// k = 1: accumulator 1.
%63 = spv.IAdd %35, %4 : i32
%64 = spv.IAdd %23, %63 : i32
%65 = spv.IMul %64, %12 : i32
%66 = spv.IAdd %65, %53 : i32
%67 = spv.IAdd %66, %51 : i32
%68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
// k = 0: accumulator 0.
%69 = spv.IAdd %23, %35 : i32
%70 = spv.IMul %69, %12 : i32
%71 = spv.IAdd %70, %53 : i32
%72 = spv.IAdd %71, %51 : i32
%73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
spv.Return
}
spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}
// Private copy of the binding interface nested next to the SPIR-V module; it
// declares the same three set-0 storage buffer bindings as the outer
// @legacy_io (two "Read", one "Write|Discard").
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
// Host-side driver for the convolution: allocates the result buffer, records
// a one-shot command buffer that binds the three buffers and dispatches
// entry point 0 of @_executable_conv_dispatch_0, then submits and waits.
//   %arg0: buffer bound at binding 0 with the range [%c0, %c607500].
//   %arg1: buffer bound at binding 1 with the range [%c0, %c3456].
// Returns the newly allocated buffer (bound at binding 2).
// In this dump the device match is read from the memoized variable
// @_device_match_id_0 rather than queried inline.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
// NOTE(review): 3456 == 3*3*3*32*4, 607500 == 225*225*3*4 and
// 1605632 == 112*112*32*4 - presumably byte lengths of the f32 tensors; confirm.
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%c1605632 = constant 1605632 : index
%c1 = constant 1 : index
// %c112 and %c32 are not referenced again inside this function.
%c112 = constant 112 : index
%c32 = constant 32 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Result buffer sized by %c1605632.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%0 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
// Push descriptor set %c0: bindings 0 and 1 are the inputs, binding 2 is the result.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
%c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
])
// Branch on the cached "vulkan*" match result.
%1 = hal.variable.load @_device_match_id_0 : i1
cond_br %1, ^bb1, ^bb2
^bb1: // pred: ^bb0
%c2_0 = constant 2 : index
%c28 = constant 28 : index
// %2 is not used by any later op in this function.
%2 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
%3 = hal.variable.load @_executable_conv_dispatch_0 : !hal.executable
// Dispatch entry point 0 with a workgroup count of [2, 28, 28].
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%3 : !hal.executable)[0] workgroups([%c2_0, %c28, %c28])
br ^bb3
^bb2: // pred: ^bb0
// No match: this path is marked unreachable.
iree.unreachable
^bb3: // pred: ^bb1
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
// Submit the command buffer and wait for it before returning the result buffer.
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
}
// Exported entry point "conv$async": awaits (%arg0, %arg1), runs @conv on the
// buffers extracted from the views %arg2 and %arg3, wraps the result buffer in
// a view of shape [1, 112, 112, 32], signals (%arg4, %arg5) and returns the view.
func @conv$async(%arg0: !hal.semaphore, %arg1: index, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.semaphore, %arg5: index) -> !hal.buffer_view attributes {iree.module.export = "conv$async"} {
%c1 = constant 1 : index
%c112 = constant 112 : index
%c32 = constant 32 : index
// Element type code for the result view.
// NOTE(review): 50331680 == 0x03000020; presumably the HAL encoding of f32 - confirm.
%c50331680_i32 = constant 50331680 : i32
// Wait on the input semaphore and fail if the wait status is not success.
%0 = hal.semaphore.await<%arg0 : !hal.semaphore> until(%arg1) : i32
hal.check_success %0, "semaphore wait failed"
%buffer = hal.buffer_view.buffer %arg2 : !hal.buffer
%buffer_0 = hal.buffer_view.buffer %arg3 : !hal.buffer
%1 = call @conv(%buffer, %buffer_0) : (!hal.buffer, !hal.buffer) -> !hal.buffer
%view = hal.buffer_view.create %1, element_type = %c50331680_i32, shape = [%c1, %c112, %c112, %c32] : !hal.buffer -> !hal.buffer_view
// Signal completion only after the result view has been created.
hal.semaphore.signal<%arg4 : !hal.semaphore> value(%arg5)
return %view : !hal.buffer_view
}
// Synchronous wrapper exported as "conv": drives @conv$async with a locally
// created semaphore (wait value 0, signal value 1) and blocks until the
// semaphore reaches 1 before returning the resulting buffer view.
func @conv$sync(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export = "conv", iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%c0 = constant 0 : index
%c1 = constant 1 : index
%device = hal.ex.shared_device : !hal.device
// Initial value 0 equals the wait value passed below.
%semaphore = hal.semaphore.create device(%device : !hal.device) initial(%c0) : !hal.semaphore
// The same semaphore is passed as both the wait and the signal semaphore.
%0 = call @conv$async(%semaphore, %c0, %arg0, %arg1, %semaphore, %c1) : (!hal.semaphore, index, !hal.buffer_view, !hal.buffer_view, !hal.semaphore, index) -> !hal.buffer_view
%1 = hal.semaphore.await<%semaphore : !hal.semaphore> until(%c1) : i32
hal.check_success %1, "semaphore wait failed"
return %0 : !hal.buffer_view
}
}
// *** IR Dump After Canonicalizer ***
// Initializer for the conv_dispatch_0 executable as it appears in this
// Canonicalizer dump: a two-way branch on the cached device match with no
// constant-true branch and no unreachable block.
// Returns the created executable on a match, otherwise a null executable.
func private @_executable_conv_dispatch_0_initializer() -> !hal.executable {
%device = hal.ex.shared_device : !hal.device
%0 = hal.variable.load @_device_match_id_0 : i1
cond_br %0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Match: create the executable from the @vulkan_spirv target with the shared layout.
%1 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
%exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv) layouts([%1]) : !hal.executable
br ^bb3(%exe : !hal.executable)
^bb2: // pred: ^bb0
// No match: hand back a null executable.
%2 = iree.null : !hal.executable
br ^bb3(%2 : !hal.executable)
^bb3(%3: !hal.executable): // 2 preds: ^bb1, ^bb2
return %3 : !hal.executable
}
// *** IR Dump After mlir::iree_compiler::IREE::HAL::SerializeExecutablesPass ***
hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
hal.interface @legacy_io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.binary @vulkan_spirv attributes {data = dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A000000
150000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000000
F50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C0000007400000077000000
7A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA000000
3D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D000400
09000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>, format = 1397773893 : i32} {
}
}
// *** IR Dump After Canonicalizer ***
// HAL-level form of @conv after canonicalization: takes the two operand buffers,
// allocates the result buffer, records a single dispatch into a one-shot command
// buffer, submits it and waits, then returns the result buffer.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
// Byte lengths used for the three bindings below. They are consistent with f32
// storage of the tensor shapes in the original function signature:
//   3*3*3*32*4 = 3456, 1*225*225*3*4 = 607500, 1*112*112*32*4 = 1605632.
%c3456 = constant 3456 : index
%c607500 = constant 607500 : index
%c1605632 = constant 1605632 : index
// %c0/%c1/%c2 serve as binding ordinals (and %c0 also as the set index and the
// binding offsets); %c2/%c28 are the workgroup counts passed to the dispatch.
%c1 = constant 1 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%c28 = constant 28 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
// Result buffer: %c1605632 bytes, allocated from the device's allocator.
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
%cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
hal.command_buffer.begin<%cmd : !hal.command_buffer>
%0 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
// Bind set %c0: binding 0 = %arg0, binding 1 = %arg1, binding 2 = the freshly
// allocated result buffer. The bracketed pairs look like [offset, length] in
// bytes -- TODO confirm against the HAL dialect documentation.
// Note this push is recorded before the device-match branch below.
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%0 : !hal.executable_layout)[%c0] bindings([
%c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
%c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
%c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
])
// Branch on the cached device-match flag: ^bb1 records and submits the dispatch,
// ^bb2 is marked unreachable.
%1 = hal.variable.load @_device_match_id_0 : i1
cond_br %1, ^bb1, ^bb2
^bb1: // pred: ^bb0
%2 = hal.variable.load @_executable_conv_dispatch_0 : !hal.executable
// Dispatch entry point 0 of the executable with workgroup counts [2, 28, 28].
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%2 : !hal.executable)[0] workgroups([%c2, %c28, %c28])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
hal.command_buffer.end<%cmd : !hal.command_buffer>
// Submit the command buffer and wait before handing the result buffer back.
hal.ex.submit_and_wait %device, %cmd
return %buffer : !hal.buffer
^bb2: // pred: ^bb0
iree.unreachable
}
// *** IR Dump After mlir::iree_compiler::IREE::VM::ConversionPass ***
module {
vm.module @module {
// Global i32 flag holding the result of the device-ID match; its value is
// produced by the initializer function below.
vm.global.i32 @_device_match_id_0 init(@_device_match_id_0_initializer) : i32
// Pattern bytes for the match; the seven values decode as ASCII to "vulkan*".
vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
// Queries the shared device and returns the i32 result of matching its ID
// against the pattern rodata above.
vm.func private @_device_match_id_0_initializer() -> i32 {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
%0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
vm.return %0 : i32
}
// Global reference to the descriptor set layout, produced by the initializer below.
vm.global.ref @_descriptor_set_layout_0 init(@_descriptor_set_layout_0_initializer) : !vm.ref<!hal.descriptor_set_layout>
// Creates a descriptor set layout on the shared device with three binding tuples:
// (0, 7, 1), (1, 7, 1) and (2, 7, 6).
vm.func private @_descriptor_set_layout_0_initializer() -> !vm.ref<!hal.descriptor_set_layout> {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// %c1 is passed as the %usage_type operand of hal.descriptor_set_layout.create
// (operand name taken from the vm.import declaration in this module).
%c1 = vm.const.i32 1 : i32
// The constants below are unfolded, so equal values appear under several SSA
// names (%c1_0, %c1_1, %c1_3, ...). They fill the three tuples. The first tuple
// element lines up with the binding ordinals 0/1/2 used by @conv; the other two
// are presumably descriptor type and access flags -- TODO confirm against the
// HAL enum definitions.
%zero = vm.const.i32.zero : i32
%c7 = vm.const.i32 7 : i32
%c1_0 = vm.const.i32 1 : i32
%c1_1 = vm.const.i32 1 : i32
%c7_2 = vm.const.i32 7 : i32
%c1_3 = vm.const.i32 1 : i32
%c2 = vm.const.i32 2 : i32
%c7_4 = vm.const.i32 7 : i32
%c6 = vm.const.i32 6 : i32
%ref_5 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7, %c1_0), (%c1_1, %c7_2, %c1_3), (%c2, %c7_4, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
vm.return %ref_5 : !vm.ref<!hal.descriptor_set_layout>
}
// Global reference to the executable layout, produced by the initializer below.
vm.global.ref @_executable_layout_0 init(@_executable_layout_0_initializer) : !vm.ref<!hal.executable_layout>
// Creates an executable layout on the shared device from the single descriptor
// set layout held in @_descriptor_set_layout_0.
vm.func private @_executable_layout_0_initializer() -> !vm.ref<!hal.executable_layout> {
// Reads the global @_descriptor_set_layout_0 (initialized by its own
// initializer function elsewhere in this module).
%_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// %zero is passed as the %push_constants operand (name from the vm.import
// declaration of hal.executable_layout.create in this module).
%zero = vm.const.i32.zero : i32
%ref_0 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
vm.return %ref_0 : !vm.ref<!hal.executable_layout>
}
vm.global.ref @_executable_conv_dispatch_0 init(@_executable_conv_dispatch_0_initializer) : !vm.ref<!hal.executable>
vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A0000001
50000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000000F
50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C00000074000000770000007
A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA0000003
D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D0004000
9000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
// Initializer for @_executable_conv_dispatch_0: when the device-match flag is
// nonzero it creates the executable from the embedded binary rodata, otherwise
// it returns a null executable reference.
vm.func private @_executable_conv_dispatch_0_initializer() -> !vm.ref<!hal.executable> {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Executable format code: 1397773893 == 0x53505645, whose bytes read as ASCII
// "SPVE" (the same four bytes appear at offset 4 of the rodata blob).
%c1397773893 = vm.const.i32 1397773893 : i32
%_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
%ref_0 = vm.call.variadic @hal.executable.create(%ref, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_0 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// No device match: fall through with a null executable reference.
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%0: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.return %0 : !vm.ref<!hal.executable>
}
// VM-level form of @conv: allocates the result buffer, records one dispatch into
// a command buffer, submits it and waits, then returns the result buffer. Each
// HAL op of the pre-conversion function becomes a vm.call to the matching import.
vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
// Binding byte lengths (3456 / 607500 / 1605632), binding ordinals and offsets
// (%zero, %c1, %c2), and workgroup counts (%c2, %c28).
%c3456 = vm.const.i32 3456 : i32
%c607500 = vm.const.i32 607500 : i32
%c1605632 = vm.const.i32 1605632 : i32
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c2 = vm.const.i32 2 : i32
%c28 = vm.const.i32 28 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// %c50 / %c14 are the %memory_types / %buffer_usage operands of
// hal.allocator.allocate. They appear to be the integer encodings of
// "HostVisible|DeviceVisible|DeviceLocal" and "Transfer|Mapping|Dispatch" from
// the HAL-level op in the preceding dump -- TODO confirm the bit assignments.
%c50 = vm.const.i32 50 : i32
%c14 = vm.const.i32 14 : i32
%ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
// %c1_2 / %c3 are the %modes / %command_categories operands; presumably the
// encodings of mode(OneShot) and categories("Transfer|Dispatch") -- TODO confirm.
%c1_2 = vm.const.i32 1 : i32
%c3 = vm.const.i32 3 : i32
%ref_3 = vm.call @hal.command_buffer.create(%ref, %c1_2, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_3) : (!vm.ref<!hal.command_buffer>) -> ()
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Push set %zero with three binding tuples: (0, %arg0, 0, 607500),
// (1, %arg1, 0, 3456), (2, result buffer, 0, 1605632). The trailing two i32s
// look like byte offset and length -- TODO confirm.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_3, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
// Branch on the device-match flag: ^bb1 dispatches and returns, ^bb2 fails.
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
// Entry point 0, workgroup counts x=2, y=28, z=28 (operand names from the
// vm.import declaration of hal.command_buffer.dispatch).
%zero_4 = vm.const.i32.zero : i32
vm.call @hal.command_buffer.dispatch(%ref_3, %_executable_conv_dispatch_0, %zero_4, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// %c20 / %c5 / %zero_5 are the source stage mask, target stage mask and flags;
// presumably the encodings of "Dispatch|CommandRetire", "CommandIssue|Dispatch"
// and "None" from the HAL-level barrier op -- TODO confirm.
%c20 = vm.const.i32 20 : i32
%c5 = vm.const.i32 5 : i32
%zero_5 = vm.const.i32.zero : i32
vm.call @hal.command_buffer.execution_barrier(%ref_3, %c20, %c5, %zero_5) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_3) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref, %ref_3) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
vm.return %ref_1 : !vm.ref<!hal.buffer>
^bb2: // pred: ^bb0
// Lowered form of the HAL-level iree.unreachable: fail with status code 2.
%c2_6 = vm.const.i32 2 : i32
vm.fail %c2_6, "unreachable location reached"
}
// Semaphore-ordered wrapper around @conv operating on buffer views.
//   %arg0/%arg1: semaphore and value to await before running.
//   %arg2/%arg3: input buffer views.
//   %arg4/%arg5: semaphore and value to signal once @conv has returned.
// Returns a buffer view over the result buffer with shape [1, 112, 112, 32].
vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
// Shape dimensions of the result view.
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// Passed as %element_type to hal.buffer_view.create. 50331680 == 0x03000020;
// presumably the HAL encoding for f32 -- TODO confirm.
%c50331680 = vm.const.i32 50331680 : i32
// Await the wait semaphore; a nonzero result triggers the failure below.
%0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_fail %0, "semaphore wait failed"
// Unwrap the backing buffers of both input views and run the HAL-buffer @conv.
%ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
vm.return %ref_2 : !vm.ref<!hal.buffer_view>
}
vm.export @conv$async
// Synchronous wrapper exported under the name "conv": creates a semaphore, runs
// @conv$async against it, awaits the signaled value and returns the result view.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%zero = vm.const.i32.zero : i32
%c1 = vm.const.i32 1 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Semaphore created with initial value 0.
%ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
// The same semaphore serves as both the wait side (value 0) and the signal
// side (value 1) of the async call.
%ref_1 = vm.call @conv$async(%ref_0, %zero, %arg0, %arg1, %ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.semaphore>, i32) -> !vm.ref<!hal.buffer_view>
// Await value 1 on the semaphore; a nonzero await result fails the call.
%0 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_fail %0, "semaphore wait failed"
vm.return %ref_1 : !vm.ref<!hal.buffer_view>
}
vm.export @conv$sync as("conv")
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.ref<!iree.byte_buffer>, %offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.allocator(%buffer : !vm.ref<!hal.buffer>) -> !vm.ref<!hal.allocator> attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.fill(%target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.ref<!iree.byte_buffer>, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i32) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
}
}
// *** IR Dump After mlir::iree_compiler::IREE::VM::GlobalInitializationPass ***
vm.module @module {
vm.global.i32 @_device_match_id_0 mutable : i32
vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
// Computes the value stored in the global @_device_match_id_0: fetches the
// shared HAL device and returns the i32 result of @hal.device.match.id for it
// against a rodata pattern.
vm.func private @_device_match_id_0_initializer() -> i32 {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// The referenced rodata bytes [118, 117, 108, 107, 97, 110, 42] are the ASCII
// string "vulkan*".
%_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
%0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
vm.return %0 : i32
}
vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
// Computes the value stored in the global @_descriptor_set_layout_0: creates
// a descriptor set layout on the shared HAL device with three binding tuples
// and returns it.
vm.func private @_descriptor_set_layout_0_initializer() -> !vm.ref<!hal.descriptor_set_layout> {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Constants are materialized once per use here, so several SSA values below
// hold the same literal (e.g. %c1, %c1_0, %c1_1, %c1_3 are all 1).
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c7 = vm.const.i32 7 : i32
%c1_0 = vm.const.i32 1 : i32
%c1_1 = vm.const.i32 1 : i32
%c7_2 = vm.const.i32 7 : i32
%c1_3 = vm.const.i32 1 : i32
%c2 = vm.const.i32 2 : i32
%c7_4 = vm.const.i32 7 : i32
%c6 = vm.const.i32 6 : i32
// %usage_type = 1; binding tuples are (0, 7, 1), (1, 7, 1) and (2, 7, 6).
// NOTE(review): the tuple fields are unnamed in the import signature;
// presumably (binding ordinal, descriptor type, access flags) - confirm
// against the HAL module definition.
%ref_5 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7, %c1_0), (%c1_1, %c7_2, %c1_3), (%c2, %c7_4, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
vm.return %ref_5 : !vm.ref<!hal.descriptor_set_layout>
}
vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
// Computes the value stored in the global @_executable_layout_0: creates an
// executable layout on the shared HAL device from the single descriptor set
// layout held in @_descriptor_set_layout_0.
vm.func private @_executable_layout_0_initializer() -> !vm.ref<!hal.executable_layout> {
// Reads the global, so @_descriptor_set_layout_0 must already have been
// stored before this function is called.
%_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// %push_constants = 0.
%zero = vm.const.i32.zero : i32
%ref_0 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
vm.return %ref_0 : !vm.ref<!hal.executable_layout>
}
vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A0000001
50000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000000F
50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C00000074000000770000007
A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA0000003
D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D0004000
9000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
// Computes the value stored in the global @_executable_conv_dispatch_0.
// If @_device_match_id_0 is non-zero, creates a HAL executable from the
// embedded @_conv_dispatch_0_vulkan_spirv_binary_spirv rodata; otherwise
// returns a null executable reference.
vm.func private @_executable_conv_dispatch_0_initializer() -> !vm.ref<!hal.executable> {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// %executable_format: 1397773893 == 0x53505645, i.e. the ASCII bytes "SPVE".
%c1397773893 = vm.const.i32 1397773893 : i32
%_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
%ref_0 = vm.call.variadic @hal.executable.create(%ref, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_0 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// Device did not match: hand back a null reference.
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%0: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.return %0 : !vm.ref<!hal.executable>
}
// Buffer-level entry point (noinline). Allocates an output buffer, records one
// dispatch of @_executable_conv_dispatch_0 into a fresh command buffer,
// submits it and waits, then returns the output buffer.
//   %arg0, %arg1: input HAL buffers, bound at bindings 0 and 1.
//   returns:      the newly allocated HAL buffer, bound at binding 2.
vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
// Binding lengths: 3456 == 3*3*3*32*4, 607500 == 225*225*3*4 and
// 1605632 == 112*112*32*4; presumably the byte sizes of the f32 filter, input
// and output tensors of the original convolution - confirm against the
// tensor-level IR earlier in the dump.
%c3456 = vm.const.i32 3456 : i32
%c607500 = vm.const.i32 607500 : i32
%c1605632 = vm.const.i32 1605632 : i32
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c2 = vm.const.i32 2 : i32
%c28 = vm.const.i32 28 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// %memory_types = 50, %buffer_usage = 14 (bitfield meanings are not visible
// in this dump), %allocation_size = 1605632.
%c50 = vm.const.i32 50 : i32
%c14 = vm.const.i32 14 : i32
%ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
// %modes = 1, %command_categories = 3.
%c1_2 = vm.const.i32 1 : i32
%c3 = vm.const.i32 3 : i32
%ref_3 = vm.call @hal.command_buffer.create(%ref, %c1_2, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_3) : (!vm.ref<!hal.command_buffer>) -> ()
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Pushes descriptor set 0 with three buffer tuples: (0, %arg0, 0, 607500),
// (1, %arg1, 0, 3456) and (2, output, 0, 1605632).
// NOTE(review): tuple fields are presumably (binding, buffer, offset, length).
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_3, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
// Branch on the device-match flag computed at module initialization.
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
// Entry point 0 with workgroup counts x=2, y=28, z=28.
%zero_4 = vm.const.i32.zero : i32
vm.call @hal.command_buffer.dispatch(%ref_3, %_executable_conv_dispatch_0, %zero_4, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// %source_stage_mask = 20, %target_stage_mask = 5, %flags = 0.
%c20 = vm.const.i32 20 : i32
%c5 = vm.const.i32 5 : i32
%zero_5 = vm.const.i32.zero : i32
vm.call @hal.command_buffer.execution_barrier(%ref_3, %c20, %c5, %zero_5) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_3) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref, %ref_3) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
vm.return %ref_1 : !vm.ref<!hal.buffer>
^bb2: // pred: ^bb0
// Reached when @_device_match_id_0 is zero; fails with status 2.
// NOTE(review): the command buffer begun above is never ended or submitted on
// this path.
%c2_6 = vm.const.i32 2 : i32
vm.fail %c2_6, "unreachable location reached"
}
// Semaphore-bracketed wrapper around @conv operating on buffer views.
//   %arg0, %arg1: semaphore and value awaited before any work is done.
//   %arg2, %arg3: input buffer views whose backing buffers are passed to @conv.
//   %arg4, %arg5: semaphore and value signaled after the result view is built.
//   returns:      a buffer view over @conv's output with shape [1, 112, 112, 32].
vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// %element_type: 50331680 == 0x03000020.
// NOTE(review): presumably the HAL encoding of f32 - confirm.
%c50331680 = vm.const.i32 50331680 : i32
// Await returns an i32 status; a non-zero status fails the function.
%0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_fail %0, "semaphore wait failed"
%ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
vm.return %ref_2 : !vm.ref<!hal.buffer_view>
}
vm.export @conv$async
// Synchronous wrapper around @conv$async; exported from the module as "conv".
// Creates one semaphore with initial value 0 and passes it to @conv$async as
// both the wait semaphore (value 0) and the signal semaphore (value 1), then
// awaits value 1 before returning the resulting buffer view.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%zero = vm.const.i32.zero : i32
%c1 = vm.const.i32 1 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
%ref_1 = vm.call @conv$async(%ref_0, %zero, %arg0, %arg1, %ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.semaphore>, i32) -> !vm.ref<!hal.buffer_view>
// A non-zero await status fails the function instead of returning the view.
%0 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_fail %0, "semaphore wait failed"
vm.return %ref_1 : !vm.ref<!hal.buffer_view>
}
vm.export @conv$sync as("conv")
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.ref<!iree.byte_buffer>, %offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.allocator(%buffer : !vm.ref<!hal.buffer>) -> !vm.ref<!hal.allocator> attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.fill(%target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.ref<!iree.byte_buffer>, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i32) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
// Module initializer (exported as @__init): runs each global's initializer
// function and stores the result into the corresponding global.
// Order matters: the executable layout initializer loads
// @_descriptor_set_layout_0, and the executable initializer loads
// @_device_match_id_0 and @_executable_layout_0.
vm.func @__init() {
%0 = vm.call @_device_match_id_0_initializer() : () -> i32
vm.global.store.i32 %0, @_device_match_id_0 : i32
%ref = vm.call @_descriptor_set_layout_0_initializer() : () -> !vm.ref<!hal.descriptor_set_layout>
vm.global.store.ref %ref, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref_0 = vm.call @_executable_layout_0_initializer() : () -> !vm.ref<!hal.executable_layout>
vm.global.store.ref %ref_0, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%ref_1 = vm.call @_executable_conv_dispatch_0_initializer() : () -> !vm.ref<!hal.executable>
vm.global.store.ref %ref_1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
vm.export @__init
}
// *** IR Dump After Canonicalizer ***
// Synchronous wrapper around @conv$async, shown after canonicalization: the
// failure check on the await status is an explicit conditional branch to a
// block ending in vm.fail.
// One semaphore (initial value 0) serves as both the wait semaphore (value 0)
// and the signal semaphore (value 1) of @conv$async.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%zero = vm.const.i32.zero : i32
%c1 = vm.const.i32 1 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
%ref_1 = vm.call @conv$async(%ref_0, %zero, %arg0, %arg1, %ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.semaphore>, i32) -> !vm.ref<!hal.buffer_view>
%0 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
// Non-zero status -> ^bb2 (failure, status forwarded); zero -> ^bb1 (success).
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
vm.return %ref_1 : !vm.ref<!hal.buffer_view>
^bb2(%1: i32): // pred: ^bb0
// Fails with the await status as the failure code.
vm.fail %1, "semaphore wait failed"
}
// *** IR Dump After Canonicalizer ***
// Semaphore-bracketed wrapper around @conv, shown after canonicalization: the
// failure check on the await status is an explicit conditional branch.
//   %arg0, %arg1: semaphore and value awaited before any work is done.
//   %arg2, %arg3: input buffer views whose backing buffers are passed to @conv.
//   %arg4, %arg5: semaphore and value signaled after the result view is built.
//   returns:      a buffer view over @conv's output with shape [1, 112, 112, 32].
vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// %element_type: 50331680 == 0x03000020.
// NOTE(review): presumably the HAL encoding of f32 - confirm.
%c50331680 = vm.const.i32 50331680 : i32
%0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
// Non-zero status -> ^bb2 (failure, status forwarded); zero -> ^bb1 (success).
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
%ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
vm.return %ref_2 : !vm.ref<!hal.buffer_view>
^bb2(%1: i32): // pred: ^bb0
// Fails with the await status as the failure code; %arg4 is not signaled on
// this path.
vm.fail %1, "semaphore wait failed"
}
// *** IR Dump After Canonicalizer ***
// Buffer-level entry point (noinline), shown after canonicalization: every
// constant is defined once at function entry and shared by all users.
// Allocates an output buffer, records one dispatch of
// @_executable_conv_dispatch_0 into a fresh command buffer, submits it and
// waits, then returns the output buffer.
//   %arg0, %arg1: input HAL buffers, bound at bindings 0 and 1.
//   returns:      the newly allocated HAL buffer, bound at binding 2.
vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
// Binding lengths: 3456 == 3*3*3*32*4, 607500 == 225*225*3*4 and
// 1605632 == 112*112*32*4; presumably the byte sizes of the f32 filter, input
// and output tensors of the original convolution - confirm against the
// tensor-level IR earlier in the dump.
%c3456 = vm.const.i32 3456 : i32
%c607500 = vm.const.i32 607500 : i32
%c1605632 = vm.const.i32 1605632 : i32
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c2 = vm.const.i32 2 : i32
%c28 = vm.const.i32 28 : i32
%c50 = vm.const.i32 50 : i32
%c14 = vm.const.i32 14 : i32
%c3 = vm.const.i32 3 : i32
%c20 = vm.const.i32 20 : i32
%c5 = vm.const.i32 5 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// %memory_types = 50, %buffer_usage = 14 (bitfield meanings are not visible
// in this dump), %allocation_size = 1605632.
%ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
// %modes = 1, %command_categories = 3.
%ref_2 = vm.call @hal.command_buffer.create(%ref, %c1, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Pushes descriptor set 0 with three buffer tuples: (0, %arg0, 0, 607500),
// (1, %arg1, 0, 3456) and (2, output, 0, 1605632).
// NOTE(review): tuple fields are presumably (binding, buffer, offset, length).
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_2, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
// Branch on the device-match flag held in the global.
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
// Entry point 0 with workgroup counts x=2, y=28, z=28.
vm.call @hal.command_buffer.dispatch(%ref_2, %_executable_conv_dispatch_0, %zero, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// %source_stage_mask = 20, %target_stage_mask = 5, %flags = 0.
vm.call @hal.command_buffer.execution_barrier(%ref_2, %c20, %c5, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref, %ref_2) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
vm.return %ref_1 : !vm.ref<!hal.buffer>
^bb2: // pred: ^bb0
// Reached when @_device_match_id_0 is zero; fails with status 2 (%c2 is the
// same constant used as binding index and workgroup count above).
// NOTE(review): the command buffer begun above is never ended or submitted on
// this path.
vm.fail %c2, "unreachable location reached"
}
// *** IR Dump After Canonicalizer ***
// Initializer for the conv dispatch executable, shown after canonicalization:
// both constants are defined at function entry.
// If @_device_match_id_0 is non-zero, creates a HAL executable from the
// embedded @_conv_dispatch_0_vulkan_spirv_binary_spirv rodata; otherwise
// returns a null executable reference.
vm.func private @_executable_conv_dispatch_0_initializer() -> !vm.ref<!hal.executable> {
// %executable_format: 1397773893 == 0x53505645, i.e. the ASCII bytes "SPVE".
%c1397773893 = vm.const.i32 1397773893 : i32
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
%ref_0 = vm.call.variadic @hal.executable.create(%ref, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_0 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// Device did not match: hand back the null reference.
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%0: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.return %0 : !vm.ref<!hal.executable>
}
// *** IR Dump After Canonicalizer ***
// Initializer returning the !hal.executable_layout built from the single
// descriptor set layout stored in the @_descriptor_set_layout_0 global.
vm.func private @_executable_layout_0_initializer() -> !vm.ref<!hal.executable_layout> {
%zero = vm.const.i32.zero : i32
%_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Second operand (%push_constants in the import declaration) is zero; the
// variadic list carries exactly one set layout.
%ref_0 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
vm.return %ref_0 : !vm.ref<!hal.executable_layout>
}
// *** IR Dump After Canonicalizer ***
// Initializer returning a !hal.descriptor_set_layout with three bindings.
vm.func private @_descriptor_set_layout_0_initializer() -> !vm.ref<!hal.descriptor_set_layout> {
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c7 = vm.const.i32 7 : i32
%c2 = vm.const.i32 2 : i32
%c6 = vm.const.i32 6 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Second operand (%usage_type in the import declaration) is 1. The binding
// tuples are (0, 7, 1), (1, 7, 1) and (2, 7, 6).
// NOTE(review): the tuple fields are presumably (binding index, descriptor type,
// access flags), with the third binding differing only in the last field —
// confirm against the HAL enum definitions; the dump does not name them.
%ref_0 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
vm.return %ref_0 : !vm.ref<!hal.descriptor_set_layout>
}
// *** IR Dump After Canonicalizer ***
// Module initializer. Populates, in order, the globals
//   @_device_match_id_0, @_descriptor_set_layout_0, @_executable_layout_0,
//   @_executable_conv_dispatch_0.
// Each later step reloads the global written by an earlier step, so the
// statement order below is significant.
vm.func @__init() {
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c7 = vm.const.i32 7 : i32
%c2 = vm.const.i32 2 : i32
%c6 = vm.const.i32 6 : i32
// 1397773893 == 0x53505645 (ASCII bytes "SPVE"); second operand of hal.executable.create below.
%c1397773893 = vm.const.i32 1397773893 : i32
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
// Step 1: match the shared device against the pattern rodata and cache the i32 result.
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
%0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
vm.global.store.i32 %0, @_device_match_id_0 : i32
// Step 2: descriptor set layout with binding tuples (0, 7, 1), (1, 7, 1), (2, 7, 6).
// NOTE(review): this and the following steps each call @hal.ex.shared_device again
// (%ref_0, %ref_2, %ref_4) instead of reusing %ref.
%ref_0 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref_0, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
vm.global.store.ref %ref_1, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
// Step 3: executable layout built from the descriptor set layout just stored.
%_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref_2 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_3 = vm.call.variadic @hal.executable_layout.create(%ref_2, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
vm.global.store.ref %ref_3, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Step 4: executable, created only when the cached match id is non-zero.
%ref_4 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
%ref_5 = vm.call.variadic @hal.executable.create(%ref_4, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_5 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// No match: a null executable ref is stored instead.
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%1: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
// *** IR Dump After Canonicalizer ***
// Synchronous buffer_view-level wrapper around @conv.
//   %arg0, %arg1: input buffer views (their backing buffers are passed to @conv).
//   returns: a new buffer view over @conv's result with shape [1, 112, 112, 32].
// A locally created semaphore is awaited before the call and signaled/awaited
// after it; a non-zero await status is routed to ^bb2, which fails with that status.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%zero = vm.const.i32.zero : i32
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// 50331680 == 0x03000020; passed as the %element_type operand of hal.buffer_view.create.
// NOTE(review): presumably the HAL encoding for f32 — confirm against the HAL element type table.
%c50331680 = vm.const.i32 50331680 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Semaphore starts at 0 and is immediately awaited at 0.
%ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
%0 = vm.call @hal.semaphore.await(%ref_0, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
%ref_1 = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_3 = vm.call @conv(%ref_1, %ref_2) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%ref_4 = vm.call.variadic @hal.buffer_view.create(%ref_3, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
// Advance the semaphore to 1 and wait for that value before returning.
vm.call @hal.semaphore.signal(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> ()
%1 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %1, ^bb2(%1 : i32), ^bb3
^bb2(%2: i32): // 2 preds: ^bb0, ^bb1
vm.fail %2, "semaphore wait failed"
^bb3: // pred: ^bb1
vm.return %ref_4 : !vm.ref<!hal.buffer_view>
}
// *** IR Dump After Inliner ***
module {
vm.module @module {
// Module-level state written by @__init and read by @conv.
// i32 result of hal.device.match.id; used as a branch condition.
vm.global.i32 @_device_match_id_0 mutable : i32
// Pattern passed to hal.device.match.id; the bytes decode to the ASCII string "vulkan*".
vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
// Cached HAL objects created once in @__init.
vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A0000001
50000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000000F
50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C00000074000000770000007
A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA0000003
D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D0004000
9000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
// Buffer-level implementation of the convolution (marked noinline).
//   %arg0, %arg1: input buffers, bound at bindings 0 and 1.
//   returns: a freshly allocated 1605632-byte buffer bound at binding 2.
// Records one dispatch plus an execution barrier into a new command buffer,
// then submits it with hal.ex.submit_and_wait before returning.
// If @_device_match_id_0 is zero the function fails at ^bb2 instead.
vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
// Binding lengths in bytes. These equal the f32 tensor sizes in the reflection
// signature: 3*3*3*32*4 = 3456, 1*225*225*3*4 = 607500, 1*112*112*32*4 = 1605632.
%c3456 = vm.const.i32 3456 : i32
%c607500 = vm.const.i32 607500 : i32
%c1605632 = vm.const.i32 1605632 : i32
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
// %c2, %c28, %c28 are the workgroup counts (x, y, z) of the dispatch; %c2 is also
// reused as binding index 2 and as the vm.fail status in ^bb2.
%c2 = vm.const.i32 2 : i32
%c28 = vm.const.i32 28 : i32
// %c50 / %c14: %memory_types / %buffer_usage operands of hal.allocator.allocate.
// NOTE(review): bitfield meanings are not visible in this dump — confirm against HAL enums.
%c50 = vm.const.i32 50 : i32
%c14 = vm.const.i32 14 : i32
// %c3: %command_categories operand of hal.command_buffer.create (with %c1 as %modes).
%c3 = vm.const.i32 3 : i32
// %c20 / %c5: %source_stage_mask / %target_stage_mask operands of the execution barrier.
%c20 = vm.const.i32 20 : i32
%c5 = vm.const.i32 5 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// Output buffer; this is the value returned on the success path.
%ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.command_buffer.create(%ref, %c1, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Set 0: each tuple is (binding, buffer, offset 0, length in bytes).
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_2, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
// Entry point 0 with workgroup counts (2, 28, 28).
vm.call @hal.command_buffer.dispatch(%ref_2, %_executable_conv_dispatch_0, %zero, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
vm.call @hal.command_buffer.execution_barrier(%ref_2, %c20, %c5, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref, %ref_2) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
vm.return %ref_1 : !vm.ref<!hal.buffer>
^bb2: // pred: ^bb0
// Reached only when no device matched. The command buffer begun above is never
// ended or submitted on this path.
vm.fail %c2, "unreachable location reached"
}
// Semaphore-driven buffer_view-level wrapper around @conv.
//   %arg0 / %arg1: semaphore and value awaited before any work starts.
//   %arg2 / %arg3: input buffer views (their backing buffers are passed to @conv).
//   %arg4 / %arg5: semaphore and value signaled after the result view is created.
//   returns: a new buffer view over @conv's result with shape [1, 112, 112, 32].
// A non-zero status from the initial await is routed to ^bb2, which fails with it.
vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// 50331680 == 0x03000020; passed as the %element_type operand of hal.buffer_view.create.
// NOTE(review): presumably the HAL encoding for f32 — confirm against the HAL element type table.
%c50331680 = vm.const.i32 50331680 : i32
%0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
%ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
// @conv itself ends with hal.ex.submit_and_wait, so this call does not return
// until that wait has completed.
%ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
vm.return %ref_2 : !vm.ref<!hal.buffer_view>
^bb2(%1: i32): // pred: ^bb0
vm.fail %1, "semaphore wait failed"
}
vm.export @conv$async
// Synchronous buffer_view-level wrapper around @conv; exported from the module as "conv".
//   %arg0, %arg1: input buffer views (their backing buffers are passed to @conv).
//   returns: a new buffer view over @conv's result with shape [1, 112, 112, 32].
// A locally created semaphore is awaited before the call and signaled/awaited
// after it; a non-zero await status is routed to ^bb2, which fails with that status.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%zero = vm.const.i32.zero : i32
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// 50331680 == 0x03000020; passed as the %element_type operand of hal.buffer_view.create.
// NOTE(review): presumably the HAL encoding for f32 — confirm against the HAL element type table.
%c50331680 = vm.const.i32 50331680 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Semaphore starts at 0 and is immediately awaited at 0.
%ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
%0 = vm.call @hal.semaphore.await(%ref_0, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
%ref_1 = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_3 = vm.call @conv(%ref_1, %ref_2) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%ref_4 = vm.call.variadic @hal.buffer_view.create(%ref_3, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
// Advance the semaphore to 1 and wait for that value before returning.
vm.call @hal.semaphore.signal(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> ()
%1 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %1, ^bb2(%1 : i32), ^bb3
^bb2(%2: i32): // 2 preds: ^bb0, ^bb1
vm.fail %2, "semaphore wait failed"
^bb3: // pred: ^bb1
vm.return %ref_4 : !vm.ref<!hal.buffer_view>
}
vm.export @conv$sync as("conv")
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.ref<!iree.byte_buffer>, %offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.allocator(%buffer : !vm.ref<!hal.buffer>) -> !vm.ref<!hal.allocator> attributes {sym_visibility = "private"}
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer.fill(%target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> i32 attributes {sym_visibility = "private"}
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.trace(%key : !vm.ref<!iree.byte_buffer>, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i32 ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i32) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
// Module initializer (exported as @__init). Populates, in order, the globals
//   @_device_match_id_0, @_descriptor_set_layout_0, @_executable_layout_0,
//   @_executable_conv_dispatch_0.
// Each later step reloads the global written by an earlier step, so the
// statement order below is significant.
vm.func @__init() {
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c7 = vm.const.i32 7 : i32
%c2 = vm.const.i32 2 : i32
%c6 = vm.const.i32 6 : i32
// 1397773893 == 0x53505645 (ASCII bytes "SPVE"); %executable_format operand of hal.executable.create.
%c1397773893 = vm.const.i32 1397773893 : i32
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
// Step 1: match the shared device against the "vulkan*" pattern rodata and cache the i32 result.
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
%0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
vm.global.store.i32 %0, @_device_match_id_0 : i32
// Step 2: descriptor set layout with %usage_type = 1 and binding tuples
// (0, 7, 1), (1, 7, 1), (2, 7, 6).
// NOTE(review): this and the following steps each call @hal.ex.shared_device again
// (%ref_0, %ref_2, %ref_4) instead of reusing %ref; the import is declared nosideeffects.
%ref_0 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref_0, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
vm.global.store.ref %ref_1, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
// Step 3: executable layout (%push_constants = 0) built from the layout just stored.
%_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref_2 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_3 = vm.call.variadic @hal.executable_layout.create(%ref_2, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
vm.global.store.ref %ref_3, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Step 4: executable, created only when the cached match id is non-zero.
%ref_4 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
%ref_5 = vm.call.variadic @hal.executable.create(%ref_4, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_5 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// No match: a null executable ref is stored; @conv then takes its vm.fail path.
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%1: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
vm.export @__init
}
}
// *** IR Dump After SymbolDCE ***
module {
vm.module @module {
// Module-level state. Every mutable global below is written by @__init.
// Holds the i32 result of @hal.device.match.id as stored by @__init; @conv and
// @__init branch on it.
vm.global.i32 @_device_match_id_0 mutable : i32
// Pattern passed to @hal.device.match.id. The seven bytes are the ASCII
// characters "vulkan*".
vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
// Descriptor set layout created in @__init (three bindings).
vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
// Executable layout created in @__init from @_descriptor_set_layout_0; also
// loaded by @conv when pushing the descriptor set.
vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
// Executable created in @__init from the SPIR-V rodata, or a null ref when the
// device match result is zero. Loaded by @conv for the dispatch.
vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A0000001
50000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000000F
50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C00000074000000770000007
A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA0000003
D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D0004000
9000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
// @conv: records and submits a single dispatch of @_executable_conv_dispatch_0
// over the two input buffers and returns a newly allocated result buffer.
//   %arg0 - input buffer, bound with length 607500 bytes.
//   %arg1 - filter buffer, bound with length 3456 bytes.
//   returns - the 1605632-byte buffer allocated here and bound as the third binding.
// Fails with status 2 ("unreachable location reached") when @_device_match_id_0 is zero.
vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
// Byte sizes; they equal the f32 tensor sizes of the original function signature:
//   3456    = 3*3*3*32 * 4      (tensor<3x3x3x32xf32>)
//   607500  = 1*225*225*3 * 4   (tensor<1x225x225x3xf32>)
//   1605632 = 1*112*112*32 * 4  (tensor<1x112x112x32xf32>)
%c3456 = vm.const.i32 3456 : i32
%c607500 = vm.const.i32 607500 : i32
%c1605632 = vm.const.i32 1605632 : i32
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
// %c2 is reused three times below: first tuple element of the third binding,
// workgroup_x of the dispatch, and the vm.fail status in ^bb2.
%c2 = vm.const.i32 2 : i32
%c28 = vm.const.i32 28 : i32
// %c50 / %c14 are passed as memory_types / buffer_usage to hal.allocator.allocate.
// NOTE(review): these are bitfield encodings; the individual flags are not visible here.
%c50 = vm.const.i32 50 : i32
%c14 = vm.const.i32 14 : i32
%c3 = vm.const.i32 3 : i32
// %c20 / %c5 are the source / target stage masks of the execution barrier.
%c20 = vm.const.i32 20 : i32
%c5 = vm.const.i32 5 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// Allocate the result buffer (memory_types=50, buffer_usage=14, allocation_size=1605632).
%ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
// Create the command buffer with modes=1 and command_categories=3, then begin recording.
%ref_2 = vm.call @hal.command_buffer.create(%ref, %c1, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Push set 0 with three bindings: 0 -> %arg0, 1 -> %arg1, 2 -> result buffer.
// Presumably each tuple is (binding ordinal, buffer, byte offset, byte length) - TODO confirm
// against the HAL import definition; the import only names the operand %bindings.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_2, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
// Branch on the device match result stored by @__init: non-zero -> ^bb1, zero -> ^bb2.
// Note that begin and push_descriptor_set above have already run by this point.
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
// Dispatch entry_point 0 with workgroup counts x=2, y=28, z=28.
vm.call @hal.command_buffer.dispatch(%ref_2, %_executable_conv_dispatch_0, %zero, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// Barrier with source_stage_mask=20, target_stage_mask=5, flags=0.
vm.call @hal.command_buffer.execution_barrier(%ref_2, %c20, %c5, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
// Submit the recorded command buffer; the result buffer is returned right after this call.
vm.call @hal.ex.submit_and_wait(%ref, %ref_2) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
vm.return %ref_1 : !vm.ref<!hal.buffer>
^bb2: // pred: ^bb0
// Device did not match: fail with status 2. The command buffer is not ended on this path.
vm.fail %c2, "unreachable location reached"
}
// @conv$async: semaphore-fenced wrapper around @conv operating on buffer views.
//   %arg0, %arg1 - semaphore and value awaited before any work is done.
//   %arg2, %arg3 - buffer views whose backing buffers are passed to @conv.
//   %arg4, %arg5 - semaphore and value signaled after the result view is created.
//   returns - a buffer view with shape [1, 112, 112, 32] over the buffer returned by @conv.
// Fails with the status returned by the await ("semaphore wait failed") when it is non-zero.
vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// Passed as element_type to hal.buffer_view.create; 50331680 == 0x03000020.
// Presumably the HAL encoding for f32 - TODO confirm.
%c50331680 = vm.const.i32 50331680 : i32
// A non-zero await result is forwarded to ^bb2 as the failure status; zero continues in ^bb1.
%0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
// Unwrap both views to their buffers and run the convolution.
%ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
// Wrap the result buffer in a view with shape [1, 112, 112, 32].
%ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
// Signal the caller-provided semaphore before returning.
vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
vm.return %ref_2 : !vm.ref<!hal.buffer_view>
^bb2(%1: i32): // pred: ^bb0
vm.fail %1, "semaphore wait failed"
}
// Exported under its own symbol name.
vm.export @conv$async
// @conv$sync: synchronous entry point, exported as "conv" below. It performs the same
// steps as @conv$async but creates its own semaphore instead of taking one from the caller.
//   %arg0, %arg1 - buffer views whose backing buffers are passed to @conv.
//   returns - a buffer view with shape [1, 112, 112, 32] over the buffer returned by @conv.
// Fails with the await status ("semaphore wait failed") when either await returns non-zero.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%zero = vm.const.i32.zero : i32
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// Passed as element_type to hal.buffer_view.create; 50331680 == 0x03000020.
// Presumably the HAL encoding for f32 - TODO confirm.
%c50331680 = vm.const.i32 50331680 : i32
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Local semaphore created with initial_value 0 and immediately awaited at min_value 0.
%ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
%0 = vm.call @hal.semaphore.await(%ref_0, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
// Unwrap both views to their buffers and run the convolution.
%ref_1 = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_3 = vm.call @conv(%ref_1, %ref_2) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%ref_4 = vm.call.variadic @hal.buffer_view.create(%ref_3, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
// Signal the local semaphore to 1, then await that same value before returning.
vm.call @hal.semaphore.signal(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> ()
%1 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %1, ^bb2(%1 : i32), ^bb3
^bb2(%2: i32): // 2 preds: ^bb0, ^bb1
// Shared failure block for both awaits; %2 carries whichever status was non-zero.
vm.fail %2, "semaphore wait failed"
^bb3: // pred: ^bb1
vm.return %ref_4 : !vm.ref<!hal.buffer_view>
}
// The synchronous variant is the one exported under the public name "conv".
vm.export @conv$sync as("conv")
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
// @__init: exported initializer that populates every mutable module global:
//   @_device_match_id_0, @_descriptor_set_layout_0, @_executable_layout_0 and
//   @_executable_conv_dispatch_0.
// Takes no arguments and returns nothing.
vm.func @__init() {
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c7 = vm.const.i32 7 : i32
%c2 = vm.const.i32 2 : i32
%c6 = vm.const.i32 6 : i32
// Passed as executable_format to hal.executable.create; 1397773893 == 0x53505645,
// whose bytes are the ASCII characters 'S','P','V','E'.
%c1397773893 = vm.const.i32 1397773893 : i32
// Null executable stored when the device does not match (see ^bb2).
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
// Step 1: match the shared device against the "vulkan*" pattern rodata and cache the result.
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
%0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
vm.global.store.i32 %0, @_device_match_id_0 : i32
// Step 2: create the descriptor set layout with usage_type=1 and three binding tuples
// (0, 7, 1), (1, 7, 1), (2, 7, 6).
// Presumably each tuple is (binding ordinal, descriptor type, access flags) - TODO confirm;
// the import only names the operand %bindings.
%ref_0 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref_0, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
vm.global.store.ref %ref_1, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
// Step 3: reload the layout just stored and build the executable layout from it
// (push_constants=0, one set layout).
%_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref_2 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_3 = vm.call.variadic @hal.executable_layout.create(%ref_2, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
vm.global.store.ref %ref_3, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// Step 4: create the executable only when the cached device match result is non-zero.
// %ref_4 is used only in ^bb1.
%ref_4 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Matched: create the executable from the embedded SPIR-V rodata and the executable layout.
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
%ref_5 = vm.call.variadic @hal.executable.create(%ref_4, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_5 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// Not matched: forward the null executable ref.
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%1: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
// Store whichever executable (real or null) arrived as the block argument.
vm.global.store.ref %1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
// Exported under its own symbol name.
vm.export @__init
}
}
// *** IR Dump After mlir::iree_compiler::IREE::VM::SinkDefiningOpsPass ***
vm.module @module {
// Result of @hal.device.match.id; written by @__init and read by @conv and
// @__init to choose between the dispatch path and the fallback path.
vm.global.i32 @_device_match_id_0 mutable : i32
// Pattern passed to @hal.device.match.id; the bytes are ASCII for "vulkan*".
vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
// HAL objects created in @__init and stored here for later loads.
vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
// Holds the executable built from the SPIR-V rodata, or a null ref when the
// device match in @__init is zero.
vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A0000001
50000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000000F
50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C00000074000000770000007
A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA0000003
D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D0004000
9000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
// Records one dispatch of @_executable_conv_dispatch_0 into a command buffer,
// submits it and waits, then returns the output buffer.
//   %arg0, %arg1: input buffers, bound as descriptor bindings 0 and 1.
//   Returns: %ref_1, the buffer allocated below and bound as binding 2.
// Takes the vm.fail path in ^bb2 when @_device_match_id_0 is zero.
vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// Output allocation size in bytes: 1605632 = 1*112*112*32*4, which matches the
// tensor<1x112x112x32xf32> result type of the original conv function.
%c1605632 = vm.const.i32 1605632 : i32
// %c50 / %c14 are passed as memory_types / buffer_usage (per the import
// signature); NOTE(review): the individual bit meanings are not visible here.
%c50 = vm.const.i32 50 : i32
%c14 = vm.const.i32 14 : i32
%ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
%c1 = vm.const.i32 1 : i32
%c3 = vm.const.i32 3 : i32
// Command buffer created with modes = 1 and command_categories = 3.
%ref_2 = vm.call @hal.command_buffer.create(%ref, %c1, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// 3456 = 3*3*3*32*4 and 607500 = 1*225*225*3*4, matching the f32 filter and
// input tensor types of the original conv function.
%c3456 = vm.const.i32 3456 : i32
%c607500 = vm.const.i32 607500 : i32
%zero = vm.const.i32.zero : i32
%c2 = vm.const.i32 2 : i32
// Pushes descriptor set 0 with three bindings. Each tuple is presumably
// (binding, buffer, offset, length) -- TODO confirm against the HAL module.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_2, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
// Only dispatch when the device matched in @__init; otherwise fail in ^bb2.
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
%c28 = vm.const.i32 28 : i32
// Entry point 0 with workgroup counts x = 2, y = 28, z = 28.
vm.call @hal.command_buffer.dispatch(%ref_2, %_executable_conv_dispatch_0, %zero, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
%c20 = vm.const.i32 20 : i32
%c5 = vm.const.i32 5 : i32
// Barrier with source_stage_mask = 20, target_stage_mask = 5, flags = 0.
vm.call @hal.command_buffer.execution_barrier(%ref_2, %c20, %c5, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref, %ref_2) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
vm.return %ref_1 : !vm.ref<!hal.buffer>
^bb2: // pred: ^bb0
// No matching device: fail using the constant 2 in %c2 as the status operand.
vm.fail %c2, "unreachable location reached"
}
// Semaphore-driven entry point around @conv.
//   %arg0, %arg1: semaphore and value to await before running.
//   %arg2, %arg3: buffer views whose backing buffers are passed to @conv.
//   %arg4, %arg5: semaphore and value to signal after the result view exists.
//   Returns: a buffer view of shape [1, 112, 112, 32] over the @conv result.
// Fails with "semaphore wait failed" when the await returns a nonzero value.
vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
%0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
// A nonzero await result is forwarded to ^bb2 and used as the failure status.
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
%ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// Passed as element_type; 50331680 == 0x03000020. NOTE(review): presumably the
// HAL encoding for f32 -- confirm against the element type enum.
%c50331680 = vm.const.i32 50331680 : i32
%ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
vm.return %ref_2 : !vm.ref<!hal.buffer_view>
^bb2(%1: i32): // pred: ^bb0
vm.fail %1, "semaphore wait failed"
}
vm.export @conv$async
// Synchronous entry point around @conv; carries the iree.reflection signature
// attribute of the original function.
//   %arg0, %arg1: buffer views whose backing buffers are passed to @conv.
//   Returns: a buffer view of shape [1, 112, 112, 32] over the @conv result.
// Creates its own semaphore, awaits it at 0 before the call and at 1 after
// signaling; either await returning nonzero fails with "semaphore wait failed".
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%zero = vm.const.i32.zero : i32
// Local semaphore with initial_value = 0.
%ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
// Await with min_value = 0, the semaphore's initial value.
%0 = vm.call @hal.semaphore.await(%ref_0, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
%ref_1 = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_3 = vm.call @conv(%ref_1, %ref_2) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// Passed as element_type; 50331680 == 0x03000020. NOTE(review): presumably the
// HAL encoding for f32 -- confirm against the element type enum.
%c50331680 = vm.const.i32 50331680 : i32
%ref_4 = vm.call.variadic @hal.buffer_view.create(%ref_3, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
// Signal the local semaphore to 1, then await that same value before returning.
vm.call @hal.semaphore.signal(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> ()
%1 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %1, ^bb2(%1 : i32), ^bb3
^bb2(%2: i32): // 2 preds: ^bb0, ^bb1
// Shared failure block; %2 is whichever await result was nonzero.
vm.fail %2, "semaphore wait failed"
^bb3: // pred: ^bb1
vm.return %ref_4 : !vm.ref<!hal.buffer_view>
}
vm.export @conv$sync as("conv")
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
// Module initializer: fills the module globals used by @conv.
//   1. Matches the shared device against the "vulkan*" pattern and stores the
//      result in @_device_match_id_0.
//   2. Creates and stores the descriptor set layout and executable layout.
//   3. Creates the executable from the SPIR-V rodata when the device matched,
//      otherwise stores a null executable ref.
vm.func @__init() {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
%0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
vm.global.store.i32 %0, @_device_match_id_0 : i32
%ref_0 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c7 = vm.const.i32 7 : i32
%c2 = vm.const.i32 2 : i32
%c6 = vm.const.i32 6 : i32
// usage_type = 1 with three bindings numbered 0, 1, 2. Each tuple is presumably
// (binding, descriptor type, access) -- TODO confirm against the HAL module.
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref_0, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
vm.global.store.ref %ref_1, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
// The layout is reloaded from the global that was just stored.
%_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref_2 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Executable layout with push_constants = 0 and the single set layout above.
%ref_3 = vm.call.variadic @hal.executable_layout.create(%ref_2, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
vm.global.store.ref %ref_3, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%ref_4 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Branch on the match result stored at the top of this function.
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
// Passed as executable_format; 1397773893 == 0x53505645, the ASCII bytes
// "SPVE", which also appear at byte offset 4 of the rodata blob.
%c1397773893 = vm.const.i32 1397773893 : i32
%ref_5 = vm.call.variadic @hal.executable.create(%ref_4, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_5 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// Device did not match: forward a null executable ref instead.
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%1: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
vm.export @__init
}
module {
vm.module @module {
vm.global.i32 @_device_match_id_0 mutable : i32
vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A0000001
50000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000000F
50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C00000074000000770000007
A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA0000003
D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D0004000
9000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
// Internal (noinline) entry point for the convolution: allocates the result
// buffer, records a command buffer containing one dispatch of
// @_executable_conv_dispatch_0 followed by an execution barrier, hands it to
// @hal.ex.submit_and_wait, and returns the newly allocated buffer.
// %arg0 and %arg1 are the two operand buffers; they are pushed as the first two
// descriptor entries and the result buffer as the third.
vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
// 1605632 = 1*112*112*32*4, i.e. the byte size of the tensor<1x112x112x32xf32> result.
%c1605632 = vm.const.i32 1605632 : i32
// %c50 / %c14 are passed as %memory_types / %buffer_usage of @hal.allocator.allocate;
// the meaning of the individual bits is not visible in this dump.
%c50 = vm.const.i32 50 : i32
%c14 = vm.const.i32 14 : i32
%ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
%c1 = vm.const.i32 1 : i32
%c3 = vm.const.i32 3 : i32
// %c1 is the %modes argument and %c3 the %command_categories argument of @hal.command_buffer.create.
%ref_2 = vm.call @hal.command_buffer.create(%ref, %c1, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
vm.call @hal.command_buffer.begin(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
// 3456 = 3*3*3*32*4 (byte size of tensor<3x3x3x32xf32>); 607500 = 1*225*225*3*4 (byte size of tensor<1x225x225x3xf32>).
%c3456 = vm.const.i32 3456 : i32
%c607500 = vm.const.i32 607500 : i32
%zero = vm.const.i32.zero : i32
%c2 = vm.const.i32 2 : i32
// Pushes three entries into set %zero. Each tuple is (i32, buffer, i32, i32);
// NOTE(review): presumably (binding ordinal, buffer, byte offset, byte length) - confirm against the HAL import definition.
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_2, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
// @_device_match_id_0 is written by @__init from @hal.device.match.id; a zero value takes the failure path in ^bb2.
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
%c28 = vm.const.i32 28 : i32
// Dispatches entry point %zero with workgroup counts x = 2, y = 28, z = 28 (argument order per the @hal.command_buffer.dispatch import).
vm.call @hal.command_buffer.dispatch(%ref_2, %_executable_conv_dispatch_0, %zero, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
// %c20 is the %source_stage_mask, %c5 the %target_stage_mask and %zero the %flags of the barrier.
%c20 = vm.const.i32 20 : i32
%c5 = vm.const.i32 5 : i32
vm.call @hal.command_buffer.execution_barrier(%ref_2, %c20, %c5, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
vm.call @hal.command_buffer.end(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
vm.call @hal.ex.submit_and_wait(%ref, %ref_2) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
vm.return %ref_1 : !vm.ref<!hal.buffer>
^bb2: // pred: ^bb0
// Reached only when the device-match flag is zero; fails with status value 2 (%c2).
vm.fail %c2, "unreachable location reached"
}
// Semaphore-bracketed wrapper around @conv operating on buffer views.
//   %arg0 / %arg1 : semaphore and value awaited before any work is done.
//   %arg2 / %arg3 : buffer views whose backing buffers are passed to @conv.
//   %arg4 / %arg5 : semaphore and value signaled after the result view is created.
// Returns a buffer view of shape [1, 112, 112, 32] over the buffer returned by @conv.
vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
// A non-zero await result is treated as failure and forwarded to ^bb2 as the status value.
%0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
%ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// 50331680 (0x03000020) is passed as %element_type of @hal.buffer_view.create.
// NOTE(review): presumably the HAL encoding of f32 (the result is tensor<1x112x112x32xf32>) - confirm.
%c50331680 = vm.const.i32 50331680 : i32
%ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
vm.return %ref_2 : !vm.ref<!hal.buffer_view>
^bb2(%1: i32): // pred: ^bb0
vm.fail %1, "semaphore wait failed"
}
vm.export @conv$async
// Synchronous wrapper (exported under the name "conv" by the vm.export that follows).
// Creates its own semaphore with initial value 0, awaits value 0, runs @conv on the
// buffers behind %arg0 / %arg1, wraps the result in a [1, 112, 112, 32] buffer view,
// signals the semaphore to 1 and awaits that value before returning the view.
// The body repeats the await / @conv / buffer_view.create / signal sequence of
// @conv$async rather than calling it.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%zero = vm.const.i32.zero : i32
%ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
// Awaits the value the semaphore was just created with; a non-zero result goes to the failure block.
%0 = vm.call @hal.semaphore.await(%ref_0, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1: // pred: ^bb0
%ref_1 = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_2 = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
%ref_3 = vm.call @conv(%ref_1, %ref_2) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
%c1 = vm.const.i32 1 : i32
%c112 = vm.const.i32 112 : i32
%c32 = vm.const.i32 32 : i32
// 50331680 (0x03000020) is the %element_type argument; NOTE(review): presumably the f32 encoding - confirm.
%c50331680 = vm.const.i32 50331680 : i32
%ref_4 = vm.call.variadic @hal.buffer_view.create(%ref_3, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
// Signal the local semaphore to 1, then await that same value before returning.
vm.call @hal.semaphore.signal(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> ()
%1 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
vm.cond_br %1, ^bb2(%1 : i32), ^bb3
^bb2(%2: i32): // 2 preds: ^bb0, ^bb1
// Shared failure block for both awaits; %2 carries the non-zero await result.
vm.fail %2, "semaphore wait failed"
^bb3: // pred: ^bb1
vm.return %ref_4 : !vm.ref<!hal.buffer_view>
}
vm.export @conv$sync as("conv")
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
// Module initializer: populates the four mutable globals declared at the top of the module.
//   @_device_match_id_0          <- result of matching the device against the rodata pattern
//   @_descriptor_set_layout_0    <- descriptor set layout with three entries
//   @_executable_layout_0        <- executable layout built from that single set layout
//   @_executable_conv_dispatch_0 <- executable created from the SPIR-V rodata, or a null ref
//                                   when the device match result is zero
vm.func @__init() {
%ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// The rodata bytes [118, 117, 108, 107, 97, 110, 42] are the ASCII string "vulkan*".
%_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
%0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
vm.global.store.i32 %0, @_device_match_id_0 : i32
%ref_0 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
%c1 = vm.const.i32 1 : i32
%zero = vm.const.i32.zero : i32
%c7 = vm.const.i32 7 : i32
%c2 = vm.const.i32 2 : i32
%c6 = vm.const.i32 6 : i32
// %c1 is the %usage_type argument. Each binding tuple is three i32 values: (0, 7, 1), (1, 7, 1), (2, 7, 6).
// NOTE(review): presumably (binding ordinal, descriptor type, access flags) - confirm against the HAL import definition.
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref_0, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
vm.global.store.ref %ref_1, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
// The layout is reloaded from the global that was just stored.
%_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
%ref_2 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// %zero is the %push_constants argument of @hal.executable_layout.create.
%ref_3 = vm.call.variadic @hal.executable_layout.create(%ref_2, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
vm.global.store.ref %ref_3, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%ref_4 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
// Only create the executable when the device match stored above is non-zero.
%_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1: // pred: ^bb0
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
%_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
// 1397773893 = 0x53505645, whose bytes spell "SPVE" in ASCII; it is passed as %executable_format.
%c1397773893 = vm.const.i32 1397773893 : i32
%ref_5 = vm.call.variadic @hal.executable.create(%ref_4, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
vm.br ^bb3(%ref_5 : !vm.ref<!hal.executable>)
^bb2: // pred: ^bb0
// No match: store a null executable reference instead.
%null = vm.const.ref.zero : !vm.ref<!hal.executable>
vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%1: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2
vm.global.store.ref %1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
vm.return
}
vm.export @__init
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment