antiagainst/tile-vectorize-conv.mlir

## tile-vectorize-conv.mlir
// *** IR Dump After mlir::iree_compiler::IREE::SIP::MaterializeReflectionAttrsPass ***
// Exported entry point @conv: one mhlo.convolution of a 1x225x225x3 f32 input
// with a 3x3x3x32 f32 kernel, yielding a 1x112x112x32 f32 result.
// Per dimension_numbers: input and output put batch at dim 0, spatial dims at
// 1 and 2, and features at dim 3; the kernel puts spatial dims at 0 and 1,
// input features at dim 2 and output features at dim 3.
// window_strides is 2, padding is 0 and rhs_dilation is 1 for both spatial dims.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  return %0 : tensor<1x112x112x32xf32>
}

// *** IR Dump After mlir::iree_compiler::IREE::Flow::LegalizeInputTypesPass ***
// Module-level dump: the only content is the exported @conv function, which
// still consists of a single f32 mhlo.convolution followed by a return.
module  {
  func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    return %0 : tensor<1x112x112x32xf32>
  }
}


// *** IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionDynamicDimsPass ***
// @conv at this stage: every tensor type in the signature and body has a fully
// static shape, and the body is still the single mhlo.convolution plus return.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  return %0 : tensor<1x112x112x32xf32>
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ConvertHLOToLinalgOnTensorsPass ***
// @conv expressed with Linalg ops on tensors instead of mhlo.convolution:
// an output tensor is created, filled with 0.0, and then passed as the `outs`
// operand of linalg.conv_2d_input_nhwc_filter_hwcf (strides 2, dilations 1).
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  // Result-shaped tensor that serves as the destination of the fill below.
  %0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  %cst = constant 0.000000e+00 : f32
  // Zero-fill the destination; the filled tensor is the conv's `outs` operand.
  %1 = linalg.fill(%0, %cst) : tensor<1x112x112x32xf32>, f32 -> tensor<1x112x112x32xf32>
  %2 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%1 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  return %2 : tensor<1x112x112x32xf32>
}

// *** IR Dump After LinalgFoldUnitExtentDims ***
// @conv in Linalg-on-tensors form: zero constant, init_tensor, fill, then the
// strided NHWC/HWCF convolution. All four ops keep the static 4-D shapes shown.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %cst = constant 0.000000e+00 : f32
  %0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
  // Zero-filled tensor used as the `outs` operand of the convolution.
  %1 = linalg.fill(%0, %cst) : tensor<1x112x112x32xf32>, f32 -> tensor<1x112x112x32xf32>
  %2 = linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%1 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
  return %2 : tensor<1x112x112x32xf32>
}

// *** IR Dump After mlir::iree_compiler::IREE::Flow::(anonymous namespace)::DispatchLinalgOnTensorsPass ***
// @conv with the convolution wrapped in a flow.dispatch.workgroups region.
// The region is launched with workload [32, 112, 112] and receives the input
// and filter as readonly dispatch tensors and the result as a writeonly one.
// Inside, three nested scf.for loops walk tiles of the output; each iteration
// loads an input tile and a filter tile, zero-fills an output tile, runs the
// convolution on the tiles, and stores the result tile.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = flow.dispatch.workgroups[%c32, %c112, %c112](%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
    %cst = constant 0.000000e+00 : f32
    %c32_0 = constant 32 : index
    %c112_1 = constant 112 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    // Full-size result-shaped tensor; tiles of it are taken with `subtensor` below.
    %1 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
    %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
    %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
    %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
    %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
    %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
    %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
    %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
    %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
    %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
    // Outer loop over [0, 112): lower bound is id * size and step is
    // count * size, both taken from workgroup dimension 2.
    %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
    %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
    scf.for %arg5 = %2 to %c112_1 step %3 {
      // Middle loop over [0, 112), same scheme using workgroup dimension 1.
      %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
      %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
      scf.for %arg6 = %4 to %c112_1 step %5 {
        // Inner loop over [0, 32), same scheme using workgroup dimension 0.
        %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
        %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
        scf.for %arg7 = %6 to %c32_0 step %7 {
          // Input tile along dims 1 and 2: offset is 2 * iv and the extent is
          // min(2 * workgroup_size + 1, 225 - 2 * iv).
          %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
          %9 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%workgroup_size_2, %arg5)
          %10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
          %11 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%workgroup_size_1, %arg6)
          %12 = flow.dispatch.tensor.load %arg2, offsets = [%c0, %8, %10, %c0], sizes = [%c1, %9, %11, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
          // Filter tile: all of dims 0-2, and min(workgroup_size_0, 32 - iv)
          // entries of dim 3 starting at %arg7.
          %13 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg7, %workgroup_size_0)
          %14 = flow.dispatch.tensor.load %arg3, offsets = [%c0, %c0, %c0, %arg7], sizes = [%c3, %c3, %c3, %13], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
          // Output tile extents: min(workgroup_size, bound - iv) per tiled dim.
          %15 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg5, %workgroup_size_2)
          %16 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg6, %workgroup_size_1)
          %17 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg7, %workgroup_size_0)
          %18 = subtensor %1[0, %arg5, %arg6, %arg7] [1, %15, %16, %17] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x?x?x?xf32>
          %19 = linalg.fill(%18, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
          // Tile-level convolution, tagged with the "workgroup" transform marker.
          %20 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %14 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%19 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
          // The two casts round-trip through tensor<?x?x?x?xf32> and end at
          // the same type that %20 already has.
          %21 = tensor.cast %20 : tensor<1x?x?x?xf32> to tensor<?x?x?x?xf32>
          %22 = tensor.cast %21 : tensor<?x?x?x?xf32> to tensor<1x?x?x?xf32>
          flow.dispatch.tensor.store %22, %arg4, offsets = [%c0, %arg5, %arg6, %arg7], sizes = [%c1, %15, %16, %17], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
        }
      }
    }
    flow.return
  }
  return %0 : tensor<1x112x112x32xf32>
}

// *** IR Dump After Canonicalizer ***
// @conv with a tiled flow.dispatch.workgroups region (workload [32, 112, 112]).
// In this dump the per-tile output is created directly by a dynamically sized
// linalg.init_tensor inside the innermost loop, and the convolution result is
// stored without any intermediate tensor.cast.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = flow.dispatch.workgroups[%c32, %c112, %c112](%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg3: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg4: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
    %cst = constant 0.000000e+00 : f32
    %c32_0 = constant 32 : index
    %c112_1 = constant 112 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
    %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
    %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
    %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
    %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
    %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
    %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
    %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
    %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
    // Each loop starts at id * size and advances by count * size for its
    // workgroup dimension (2 for the outer, 1 for the middle, 0 for the inner).
    %1 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_2, %workgroup_size_2]
    %2 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_2, %workgroup_size_2]
    scf.for %arg5 = %1 to %c112_1 step %2 {
      %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_1, %workgroup_size_1]
      %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_1, %workgroup_size_1]
      scf.for %arg6 = %3 to %c112_1 step %4 {
        %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
        %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
        scf.for %arg7 = %5 to %c32_0 step %6 {
          // Input tile: offset 2 * iv, extent min(2 * size + 1, 225 - 2 * iv)
          // along dims 1 and 2.
          %7 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg5)
          %8 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%workgroup_size_2, %arg5)
          %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg6)
          %10 = affine.min affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>(%workgroup_size_1, %arg6)
          %11 = flow.dispatch.tensor.load %arg2, offsets = [%c0, %7, %9, %c0], sizes = [%c1, %8, %10, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
          // Filter tile: min(workgroup_size_0, 32 - iv) entries of dim 3.
          %12 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg7, %workgroup_size_0)
          %13 = flow.dispatch.tensor.load %arg3, offsets = [%c0, %c0, %c0, %arg7], sizes = [%c3, %c3, %c3, %12], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
          // Output tile extents, then a fresh zero-filled tile of that size.
          %14 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg5, %workgroup_size_2)
          %15 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 112)>(%arg6, %workgroup_size_1)
          %16 = affine.min affine_map<(d0, d1) -> (d1, -d0 + 32)>(%arg7, %workgroup_size_0)
          %17 = linalg.init_tensor [1, %14, %15, %16] : tensor<1x?x?x?xf32>
          %18 = linalg.fill(%17, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
          %19 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%11, %13 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%18 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
          flow.dispatch.tensor.store %19, %arg4, offsets = [%c0, %arg5, %arg6, %arg7], sizes = [%c1, %14, %15, %16], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
        }
      }
    }
    flow.return
  }
  return %0 : tensor<1x112x112x32xf32>
}

// *** IR Dump After mlir::iree_compiler::IREE::Flow::OutlineDispatchRegions2Pass ***
// Affine map aliases used by @conv_dispatch_0 below:
//   #map0: product of two symbols (used as id * size and count * size).
//   #map1: doubles its operand (input offset for an output index, stride 2).
//   #map2: operands are (size, iv); results are 2 * size + 1 and 225 - 2 * iv.
//   #map3: operands are (iv, size); results are size and 32 - iv.
//   #map4: operands are (iv, size); results are size and 112 - iv.
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0) -> (d0 * 2)>
#map2 = affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>
#map3 = affine_map<(d0, d1) -> (d1, -d0 + 32)>
#map4 = affine_map<(d0, d1) -> (d1, -d0 + 112)>
// Module holding a private flow.executable @conv_dispatch_0 (the tiled
// convolution as a standalone function) and the exported @conv, which now just
// issues a flow.dispatch to that executable with workload [32, 112, 112].
module  {
  flow.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
    flow.dispatch.entry @conv_dispatch_0 attributes {signature = (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>, workgroup_rank = 3 : index}
    module  {
      // Takes the input and filter as readonly dispatch tensors and the result
      // as a writeonly dispatch tensor; returns nothing.
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %c0 = constant 0 : index
        %c1 = constant 1 : index
        %c3 = constant 3 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        // Loop nest over output dims 1, 2 and 3 (bounds 112, 112, 32); each
        // loop starts at id * size and steps by count * size.
        %0 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              // Input tile offsets (#map1) and clamped extents (#map2).
              %6 = affine.apply #map1(%arg3)
              %7 = affine.min #map2(%workgroup_size_2, %arg3)
              %8 = affine.apply #map1(%arg4)
              %9 = affine.min #map2(%workgroup_size_1, %arg4)
              %10 = flow.dispatch.tensor.load %arg0, offsets = [%c0, %6, %8, %c0], sizes = [%c1, %7, %9, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              // Filter tile along its last dimension.
              %11 = affine.min #map3(%arg5, %workgroup_size_0)
              %12 = flow.dispatch.tensor.load %arg1, offsets = [%c0, %c0, %c0, %arg5], sizes = [%c3, %c3, %c3, %11], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              // Output tile extents, zero-filled tile, tile convolution, store.
              %13 = affine.min #map4(%arg3, %workgroup_size_2)
              %14 = affine.min #map4(%arg4, %workgroup_size_1)
              %15 = affine.min #map3(%arg5, %workgroup_size_0)
              %16 = linalg.init_tensor [1, %13, %14, %15] : tensor<1x?x?x?xf32>
              %17 = linalg.fill(%16, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
              %18 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%17 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %18, %arg2, offsets = [%c0, %arg3, %arg4, %arg5], sizes = [%c1, %13, %14, %15], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  // Exported entry point: dispatches the executable above on the two inputs.
  func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %c32 = constant 32 : index
    %c112 = constant 112 : index
    %0 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    return %0 : tensor<1x112x112x32xf32>
  }
}


// *** IR Dump After mlir::iree_compiler::IREE::Flow::(anonymous namespace)::HoistUnstreamableOps ***
// @conv reduced to two index constants and a single flow.dispatch of
// @conv_dispatch_0::@conv_dispatch_0 with workload [32, 112, 112].
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
  return %0 : tensor<1x112x112x32xf32>
}

// *** IR Dump After mlir::iree_compiler::IREE::Flow::FormStreamsPass ***
// @conv with the dispatch wrapped in a flow.ex.stream.fragment. The fragment
// captures the two workload constants (32 and 112) and both tensors as
// operands; inside, the dispatch workload is [%arg2, %arg3, %arg3], i.e. the
// captured 32 once and the captured 112 twice.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %0 = flow.ex.stream.fragment(%c32, %c112, %arg0, %arg1) : (index, index, tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: index, %arg3: index, %arg4: tensor<1x225x225x3xf32>, %arg5: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %1 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%arg2, %arg3, %arg3](%arg4, %arg5) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    flow.return %1 : tensor<1x112x112x32xf32>
  }
  return %0 : tensor<1x112x112x32xf32>
}

// *** IR Dump After Canonicalizer ***
// @conv with a flow.ex.stream.fragment that captures only the two tensors.
// The workload constants 112 and 32 are defined inside the fragment body and
// feed the flow.dispatch directly.
func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %0 = flow.ex.stream.fragment(%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
      (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %1 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
    flow.return %1 : tensor<1x112x112x32xf32>
  }
  return %0 : tensor<1x112x112x32xf32>
}

// *** IR Dump After Canonicalizer ***
// Affine map aliases used by @conv_dispatch_0 below. In #map2-#map4 the loop
// induction variable is the dimension d0 and the workgroup size is the
// symbol s0:
//   #map0: product of two symbols (id * size, count * size).
//   #map1: 2 * d0.
//   #map2: results are 2 * s0 + 1 and 225 - 2 * d0.
//   #map3: results are s0 and 32 - d0.
//   #map4: results are s0 and 112 - d0.
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0) -> (d0 * 2)>
#map2 = affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>
#map3 = affine_map<(d0)[s0] -> (s0, -d0 + 32)>
#map4 = affine_map<(d0)[s0] -> (s0, -d0 + 112)>
// Module with the private flow.executable @conv_dispatch_0 and the exported
// @conv, whose body is a stream fragment containing one flow.dispatch.
module  {
  flow.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
    flow.dispatch.entry @conv_dispatch_0 attributes {signature = (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>, workgroup_rank = 3 : index}
    module  {
      // Tiled convolution: readonly input and filter, writeonly result.
      func @conv_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:1x225x225x3xf32>, %arg1: !flow.dispatch.tensor<readonly:3x3x3x32xf32>, %arg2: !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) {
        %cst = constant 0.000000e+00 : f32
        %c32 = constant 32 : index
        %c112 = constant 112 : index
        %c0 = constant 0 : index
        %c1 = constant 1 : index
        %c3 = constant 3 : index
        %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
        %workgroup_size_1 = flow.dispatch.workgroup.size[1] : index
        %workgroup_size_2 = flow.dispatch.workgroup.size[2] : index
        %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
        %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
        %workgroup_id_1 = flow.dispatch.workgroup.id[1] : index
        %workgroup_count_1 = flow.dispatch.workgroup.count[1] : index
        %workgroup_id_2 = flow.dispatch.workgroup.id[2] : index
        %workgroup_count_2 = flow.dispatch.workgroup.count[2] : index
        // Loop nest with bounds 112, 112, 32; each loop starts at id * size
        // and steps by count * size for workgroup dimensions 2, 1, 0.
        %0 = affine.apply #map0()[%workgroup_id_2, %workgroup_size_2]
        %1 = affine.apply #map0()[%workgroup_count_2, %workgroup_size_2]
        scf.for %arg3 = %0 to %c112 step %1 {
          %2 = affine.apply #map0()[%workgroup_id_1, %workgroup_size_1]
          %3 = affine.apply #map0()[%workgroup_count_1, %workgroup_size_1]
          scf.for %arg4 = %2 to %c112 step %3 {
            %4 = affine.apply #map0()[%workgroup_id_0, %workgroup_size_0]
            %5 = affine.apply #map0()[%workgroup_count_0, %workgroup_size_0]
            scf.for %arg5 = %4 to %c32 step %5 {
              // Input tile offsets (#map1) and clamped extents (#map2).
              %6 = affine.apply #map1(%arg3)
              %7 = affine.min #map2(%arg3)[%workgroup_size_2]
              %8 = affine.apply #map1(%arg4)
              %9 = affine.min #map2(%arg4)[%workgroup_size_1]
              %10 = flow.dispatch.tensor.load %arg0, offsets = [%c0, %6, %8, %c0], sizes = [%c1, %7, %9, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
              // Filter tile along its last dimension.
              %11 = affine.min #map3(%arg5)[%workgroup_size_0]
              %12 = flow.dispatch.tensor.load %arg1, offsets = [%c0, %c0, %c0, %arg5], sizes = [%c3, %c3, %c3, %11], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
              // Output tile extents, zero-filled tile, tile convolution, store.
              %13 = affine.min #map4(%arg3)[%workgroup_size_2]
              %14 = affine.min #map4(%arg4)[%workgroup_size_1]
              %15 = affine.min #map3(%arg5)[%workgroup_size_0]
              %16 = linalg.init_tensor [1, %13, %14, %15] : tensor<1x?x?x?xf32>
              %17 = linalg.fill(%16, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
              %18 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%17 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
              flow.dispatch.tensor.store %18, %arg2, offsets = [%c0, %arg3, %arg4, %arg5], sizes = [%c1, %13, %14, %15], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
            }
          }
        }
        return
      }
    }
  }
  // Exported entry point: a stream fragment that dispatches the executable
  // above with workload [32, 112, 112].
  func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %0 = flow.ex.stream.fragment(%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %1 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %1 : tensor<1x112x112x32xf32>
    }
    return %0 : tensor<1x112x112x32xf32>
  }
}


// *** IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeInterfacesPass ***
// Affine map aliases used by @conv_dispatch_0 below (d0 is the loop induction
// variable, s0 the workgroup size in #map2-#map4):
//   #map0: product of two symbols (id * size, count * size).
//   #map1: 2 * d0.
//   #map2: results are 2 * s0 + 1 and 225 - 2 * d0.
//   #map3: results are s0 and 32 - d0.
//   #map4: results are s0 and 112 - d0.
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0) -> (d0 * 2)>
#map2 = affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>
#map3 = affine_map<(d0)[s0] -> (s0, -d0 + 32)>
#map4 = affine_map<(d0)[s0] -> (s0, -d0 + 112)>
// Module in which the executable is a hal.executable: it declares a
// @legacy_io interface with three storage-buffer bindings and a @vulkan_spirv
// target containing the dispatch function. The exported @conv is still a
// stream fragment with one flow.dispatch.
module  {
  hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
    // Bindings 0 and 1 are read-only, binding 2 is write/discard; all in set 0.
    hal.interface @legacy_io {
      hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.target @vulkan_spirv, filter="vulkan*" {
      hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()}
      // The spv.target_env attribute names SPIR-V v1.3 on an ARM integrated
      // GPU with subgroup_size 16, at most 512 workgroup invocations and
      // 32768 bytes of shared memory, plus the listed capabilities/extensions.
      module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        // The dispatch function takes no arguments; its three tensors are
        // obtained from the @legacy_io bindings at offset %c0.
        func @conv_dispatch_0() {
          %c0 = constant 0 : index
          %cst = constant 0.000000e+00 : f32
          %c32 = constant 32 : index
          %c112 = constant 112 : index
          %c1 = constant 1 : index
          %c3 = constant 3 : index
          %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
          %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
          %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
          // Workgroup size/id/count come from hal.interface.workgroup.* ops;
          // indices 0, 1, 2 are named x, y, z.
          %workgroup_size_x = hal.interface.workgroup.size[0] : index
          %workgroup_size_y = hal.interface.workgroup.size[1] : index
          %workgroup_size_z = hal.interface.workgroup.size[2] : index
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
          %workgroup_count_y = hal.interface.workgroup.count[1] : index
          %workgroup_id_z = hal.interface.workgroup.id[2] : index
          %workgroup_count_z = hal.interface.workgroup.count[2] : index
          // Loop nest with bounds 112 (z), 112 (y), 32 (x); each loop starts
          // at id * size and steps by count * size.
          %3 = affine.apply #map0()[%workgroup_id_z, %workgroup_size_z]
          %4 = affine.apply #map0()[%workgroup_count_z, %workgroup_size_z]
          scf.for %arg0 = %3 to %c112 step %4 {
            %5 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
            %6 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
            scf.for %arg1 = %5 to %c112 step %6 {
              %7 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
              %8 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
              scf.for %arg2 = %7 to %c32 step %8 {
                // Input tile offsets (#map1) and clamped extents (#map2).
                %9 = affine.apply #map1(%arg0)
                %10 = affine.min #map2(%arg0)[%workgroup_size_z]
                %11 = affine.apply #map1(%arg1)
                %12 = affine.min #map2(%arg1)[%workgroup_size_y]
                %13 = flow.dispatch.tensor.load %0, offsets = [%c0, %9, %11, %c0], sizes = [%c1, %10, %12, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
                // Filter tile along its last dimension.
                %14 = affine.min #map3(%arg2)[%workgroup_size_x]
                %15 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0, %c0, %arg2], sizes = [%c3, %c3, %c3, %14], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
                // Output tile extents, zero-filled tile, tile convolution, store.
                %16 = affine.min #map4(%arg0)[%workgroup_size_z]
                %17 = affine.min #map4(%arg1)[%workgroup_size_y]
                %18 = affine.min #map3(%arg2)[%workgroup_size_x]
                %19 = linalg.init_tensor [1, %16, %17, %18] : tensor<1x?x?x?xf32>
                %20 = linalg.fill(%19, %cst) : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
                %21 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
                flow.dispatch.tensor.store %21, %2, offsets = [%c0, %arg0, %arg1, %arg2], sizes = [%c1, %16, %17, %18], strides = [%c1, %c1, %c1, %c1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
              }
            }
          }
          return
        }
        // Private copy of the @legacy_io interface inside the target module,
        // with the same three bindings as the outer declaration.
        hal.interface @legacy_io attributes {sym_visibility = "private"} {
          hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  // Exported entry point: a stream fragment that dispatches
  // @conv_dispatch_0::@conv_dispatch_0 with workload [32, 112, 112].
  func @conv(%arg0: tensor<1x225x225x3xf32>, %arg1: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %0 = flow.ex.stream.fragment(%arg0, %arg1) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> =
        (%arg2: tensor<1x225x225x3xf32>, %arg3: tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32> {
      %c112 = constant 112 : index
      %c32 = constant 32 : index
      %1 = flow.dispatch @conv_dispatch_0::@conv_dispatch_0[%c32, %c112, %c112](%arg2, %arg3) : (tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) -> tensor<1x112x112x32xf32>
      flow.return %1 : tensor<1x112x112x32xf32>
    }
    return %0 : tensor<1x112x112x32xf32>
  }
}


// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::LinalgBufferizePass ***
// Dispatch region body as dumped after LinalgBufferizePass.
// At this point the function holds both forms of every value: each interface
// binding has a memref subspan and a !flow.dispatch.tensor subspan, and the
// loop body contains a memref fill/conv pair next to a tensor fill/conv pair.
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  // Bindings: @ro0 is the 1x225x225x3 input, @ro1 the 3x3x3x32 filter,
  // @wo2 the 1x112x112x32 output. Each is materialized twice (memref + tensor).
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %3 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
  %4 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %5 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_size_z = hal.interface.workgroup.size[2] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  // Each of the three loops starts at id * size and steps by count * size.
  // %arg0 / %arg1 / %arg2 index output dims 1 / 2 / 3 and are driven by the
  // z / y / x workgroup values respectively.
  %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
  %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
  scf.for %arg0 = %6 to %c112 step %7 {
    %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
    %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
    scf.for %arg1 = %8 to %c112 step %9 {
      %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
      %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
      scf.for %arg2 = %10 to %c32 step %11 {
        // Input tile: offset is 2 * output index (the conv stride is 2) and
        // the extent is min(2 * tile + 1, 225 - 2 * index) per spatial dim.
        %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg0)[%workgroup_size_z]
        %14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg1)[%workgroup_size_y]
        %16 = memref.subview %0[%c0, %12, %14, %c0] [%c1, %13, %15, %c3] [%c1, %c1, %c1, %c1] : memref<1x225x225x3xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
        %17 = flow.dispatch.tensor.load %1, offsets = [%c0, %12, %14, %c0], sizes = [%c1, %13, %15, %c3], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x?x?x3xf32>
        // Filter tile: the full 3x3x3 window, sliced only along output channels.
        %18 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %19 = memref.subview %2[%c0, %c0, %c0, %arg2] [%c3, %c3, %c3, %18] [%c1, %c1, %c1, %c1] : memref<3x3x3x32xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
        %20 = flow.dispatch.tensor.load %3, offsets = [%c0, %c0, %c0, %arg2], sizes = [%c3, %c3, %c3, %18], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x?xf32>
        // Output tile: extents are clamped against the 112x112x32 bounds.
        %21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %22 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        %23 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %24 = memref.subview %4[%c0, %arg0, %arg1, %arg2] [%c1, %21, %22, %23] [%c1, %c1, %c1, %c1] : memref<1x112x112x32xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
        %25 = linalg.init_tensor [1, %21, %22, %23] : tensor<1x?x?x?xf32>
        // Memref fill/conv write into the output subview %24; the tensor
        // fill/conv (%26, %27) produce results that have no uses in this body.
        linalg.fill(%24, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, f32
        %26 = linalg.fill(%25, %cst) {__internal_linalg_transform__ = "workgroup"} : tensor<1x?x?x?xf32>, f32 -> tensor<1x?x?x?xf32>
        linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %19 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>) outs(%24 : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>)
        %27 = linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%17, %20 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%26 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
      }
    }
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Dispatch region body as dumped after the Canonicalizer.
// The loop body now contains only memref ops. The !flow.dispatch.tensor
// subspans (%1, %3, %5) are still defined but have no uses in this function.
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %3 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
  %4 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %5 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_size_z = hal.interface.workgroup.size[2] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  // Workgroup-distributed loops: lower bound id * size, step count * size.
  %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
  %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
  scf.for %arg0 = %6 to %c112 step %7 {
    %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
    %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
    scf.for %arg1 = %8 to %c112 step %9 {
      %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
      %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
      scf.for %arg2 = %10 to %c32 step %11 {
        %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg0)[%workgroup_size_z]
        %14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg1)[%workgroup_size_y]
        // Constant offsets/sizes/strides are now written inline, and the result
        // layouts carry static strides: 151875 = 225*225*3 and 675 = 225*3 for
        // the input; only the base offset s0 stays symbolic.
        %16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        // Filter strides: 288 = 3*3*32, 96 = 3*32, 32.
        %18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        // %21 repeats the same map and operands as %17.
        %21 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        // Output strides: 401408 = 112*112*32, 3584 = 112*32, 32.
        %22 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %21] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        // Zero the output tile, then run the stride-2 convolution with the
        // same tile (%22) as its outs operand.
        linalg.fill(%22, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
        linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%22 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      }
    }
  }
  return
}

// *** IR Dump After CSE ***
// Dispatch region body as dumped after CSE.
// A single affine.min (%17) now supplies the output-channel tile extent to
// both the filter subview (%18) and the output subview (%21).
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  // The memref and !flow.dispatch.tensor subspans differ in result type, so
  // both remain; %1, %3 and %5 have no uses in this function.
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %3 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x3x32xf32>
  %4 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %5 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_size_z = hal.interface.workgroup.size[2] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  // Workgroup-distributed loops: lower bound id * size, step count * size.
  %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
  %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
  scf.for %arg0 = %6 to %c112 step %7 {
    %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
    %9 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
    scf.for %arg1 = %8 to %c112 step %9 {
      %10 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
      %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
      scf.for %arg2 = %10 to %c32 step %11 {
        // Input tile: offset 2 * index, extent min(2 * tile + 1, 225 - 2 * index).
        %12 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %13 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg0)[%workgroup_size_z]
        %14 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %15 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg1)[%workgroup_size_y]
        %16 = memref.subview %0[0, %12, %14, 0] [1, %13, %15, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        // Shared output-channel tile extent: min(tile_x, 32 - index).
        %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %18 = memref.subview %2[0, 0, 0, %arg2] [3, 3, 3, %17] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %19 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %20 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        %21 = memref.subview %4[0, %arg0, %arg1, %arg2] [1, %19, %20, %17] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        // Zero the output tile, then run the convolution with it as outs.
        linalg.fill(%21, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
        linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%16, %18 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%21 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      }
    }
  }
  return
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::BufferAllocViewCleanUpPass ***
// Dispatch region body as dumped after BufferAllocViewCleanUpPass.
// Only the three memref binding subspans remain (%0 input, %1 filter,
// %2 output); no !flow.dispatch.tensor values appear in this function.
func @conv_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  // Workgroup size/id/count are still symbolic queries at this stage.
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_size_z = hal.interface.workgroup.size[2] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  %workgroup_count_z = hal.interface.workgroup.count[2] : index
  // Workgroup-distributed loops: lower bound id * size, step count * size.
  %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
  %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
  scf.for %arg0 = %3 to %c112 step %4 {
    %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
    %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
    scf.for %arg1 = %5 to %c112 step %6 {
      %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
      %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
      scf.for %arg2 = %7 to %c32 step %8 {
        // Input tile: offset 2 * index, extent min(2 * tile + 1, 225 - 2 * index).
        %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %10 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg0)[%workgroup_size_z]
        %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %12 = affine.min affine_map<(d0)[s0] -> (s0 * 2 + 1, d0 * -2 + 225)>(%arg1)[%workgroup_size_y]
        %13 = memref.subview %0[0, %9, %11, 0] [1, %10, %12, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        // Output-channel tile extent, used by both the filter and output views.
        %14 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
        %15 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %14] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg0)[%workgroup_size_z]
        %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 112)>(%arg1)[%workgroup_size_y]
        %18 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %16, %17, %14] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        // Zero the output tile, then run the convolution with it as outs.
        linalg.fill(%18, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
        linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      }
    }
  }
  return
}

Root op: linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, is_root_op, strides = dense<2> : tensor<2xi64>} ins(%13, %15 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%18 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
Queried workload size: 32, 112, 112, 1
Queried tile size: 16, 4, 4, 0
--- After concretizing hal.interface.workgroup ops ---
// Dispatch region body after the hal.interface.workgroup.size/count queries
// were replaced by constants: %c4 scales the z and y ids, %c16 scales the x id.
// Only the workgroup.id queries remain symbolic.
func @conv_dispatch_0() {
  %c4 = constant 4 : index
  %c16 = constant 16 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %c32 = constant 32 : index
  %c112 = constant 112 : index
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Every loop's step equals its upper bound (112, 112, 32), so each loop
  // executes at most one iteration for a given workgroup id.
  %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %c4]
  scf.for %arg0 = %3 to %c112 step %c112 {
    %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c4]
    scf.for %arg1 = %4 to %c112 step %c112 {
      %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c16]
      scf.for %arg2 = %5 to %c32 step %c32 {
        // The first affine.min result is now the constant 9 (= 4 * 2 + 1).
        // The symbol operand is still passed, but s0 no longer appears in the map.
        %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
        %7 = affine.min affine_map<(d0)[s0] -> (9, d0 * -2 + 225)>(%arg0)[%c4]
        %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
        %9 = affine.min affine_map<(d0)[s0] -> (9, d0 * -2 + 225)>(%arg1)[%c4]
        %10 = memref.subview %0[0, %6, %8, 0] [1, %7, %9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %11 = affine.min affine_map<(d0)[s0] -> (16, -d0 + 32)>(%arg2)[%c16]
        %12 = memref.subview %1[0, 0, 0, %arg2] [3, 3, 3, %11] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        %13 = affine.min affine_map<(d0)[s0] -> (4, -d0 + 112)>(%arg0)[%c4]
        %14 = affine.min affine_map<(d0)[s0] -> (4, -d0 + 112)>(%arg1)[%c4]
        %15 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, %13, %14, %11] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
        linalg.fill(%15, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
        // The conv now also carries is_root_op and records the un-tiled operand
        // types in iree.codegen.original_input_types / original_output_types.
        linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%15 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      }
    }
  }
  return
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ConcretizeTileAmongWorkgroupsPass ***
// Vulkan/SPIR-V executable target as dumped after
// ConcretizeTileAmongWorkgroupsPass. It holds the entry point declaration,
// the inner module with the dispatch function, and the @legacy_io interface.
hal.executable.target @vulkan_spirv, filter="vulkan*" {
  // The entry point region returns the constants (2, 28, 28).
  // NOTE(review): presumably the workgroup counts along x, y, z (32/16, 112/4,
  // 112/4 with the 16/4/4 tile sizes used below) - confirm against the pass.
  hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
  ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
    %c2 = constant 2 : index
    %c28 = constant 28 : index
    %c28_0 = constant 28 : index
    hal.return %c2, %c28, %c28_0 : index, index, index
  }
  module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
    // The function now carries spv.entry_point_abi with local_size [4, 4, 1]
    // and contains no scf.for loops: tile offsets are computed directly from
    // the workgroup ids (%3, %4, %5).
    func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
      %c4 = constant 4 : index
      %c16 = constant 16 : index
      %c0 = constant 0 : index
      %cst = constant 0.000000e+00 : f32
      %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
      %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
      %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %workgroup_id_z = hal.interface.workgroup.id[2] : index
      // Output tile origin: (id_z * 4, id_y * 4, id_x * 16).
      %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %c4]
      %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c4]
      %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c16]
      // Input tile: origin is twice the output origin; extent min(9, 225 - 2 * origin).
      %6 = affine.apply affine_map<(d0) -> (d0 * 2)>(%3)
      %7 = affine.min affine_map<(d0)[s0] -> (9, d0 * -2 + 225)>(%3)[%c4]
      %8 = affine.apply affine_map<(d0) -> (d0 * 2)>(%4)
      %9 = affine.min affine_map<(d0)[s0] -> (9, d0 * -2 + 225)>(%4)[%c4]
      %10 = memref.subview %0[0, %6, %8, 0] [1, %7, %9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %11 = affine.min affine_map<(d0)[s0] -> (16, -d0 + 32)>(%5)[%c16]
      %12 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, %11] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      %13 = affine.min affine_map<(d0)[s0] -> (4, -d0 + 112)>(%3)[%c4]
      %14 = affine.min affine_map<(d0)[s0] -> (4, -d0 + 112)>(%4)[%c4]
      %15 = memref.subview %2[0, %3, %4, %5] [1, %13, %14, %11] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Zero the output tile, then run the convolution with it as outs.
      linalg.fill(%15, %cst) {__internal_linalg_transform__ = "workgroup"} : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
      linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, strides = dense<2> : tensor<2xi64>} ins(%10, %12 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%15 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
      return
    }
    // Binding table: two read-only storage buffers (input, filter) and one
    // write/discard storage buffer (output), all in descriptor set 0.
    hal.interface @legacy_io attributes {sym_visibility = "private"} {
      hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
  }
}


--- IREE Linalg tile and fuse configuration ---
@func conv_dispatch_0: # workgroup sizes: [4, 4, 1]
	linalg.fill : {0 : [0, 4, 4, 16], 1 : [], 2 : [0, 4, 1, 4], 3 : [0, 0, 0, 0, 1, 1, 4]}
	linalg.conv_2d_input_nhwc_filter_hwcf : {0 : [0, 4, 4, 16], 1 : [], 2 : [0, 4, 1, 4], 3 : [0, 0, 0, 0, 1, 1, 4]}
--- After Second level Tiling  ---
// Dispatch function as dumped under "After Second level Tiling".
// The constant multipliers are folded into the affine maps, and the filter and
// output subviews have fully static shapes (3x3x3x16 and 1x4x4x16). The input
// subview still has dynamic spatial extents.
// NOTE(review): no new loops or subviews appear at this level; presumably this
// is because the level-1 tile sizes in the printed configuration are empty
// (`1 : []`) - confirm.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Output tile origin: (id_z * 4, id_y * 4, id_x * 16).
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  // Input tile origin: (id_z * 8, id_y * 8); extent min(9, 225 - 8 * id).
  %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
  %7 = affine.min affine_map<()[s0] -> (9, s0 * -8 + 225)>()[%workgroup_id_z]
  %8 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  %9 = affine.min affine_map<()[s0] -> (9, s0 * -8 + 225)>()[%workgroup_id_y]
  %10 = memref.subview %0[0, %6, %8, 0] [1, %7, %9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %11 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %12 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Both ops are tagged with the same launch_info_key ("__op_num_0__").
  linalg.fill(%12, %cst) {__internal_linalg_transform__ = "workgroup", launch_info_key = "__op_num_0__"} : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
  linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, launch_info_key = "__op_num_0__", strides = dense<2> : tensor<2xi64>} ins(%10, %11 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%12 : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
  return
}

--- After Third level Tiling  ---
// Dispatch function as dumped under "After Third level Tiling".
// The workgroup tiles (%10 input, %11 filter, %12 output) are sliced again
// using gpu.thread_id values; fill and conv each operate on a 1x4x1x4 output
// slice. The fill is marked "vectorize" and the conv "tile_conv_filter".
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Workgroup-level tile origins and extents.
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
  %7 = affine.min affine_map<()[s0] -> (9, s0 * -8 + 225)>()[%workgroup_id_z]
  %8 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  %9 = affine.min affine_map<()[s0] -> (9, s0 * -8 + 225)>()[%workgroup_id_y]
  %10 = memref.subview %0[0, %6, %8, 0] [1, %7, %9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %11 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %12 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Fill: the slice of the output tile is at (tid_z * 4, tid_y, tid_x * 4)
  // with shape 1x4x1x4.
  %13 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %14 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %15 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %16 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%15]
  %17 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %18 = memref.subview %12[0, %16, %14, %17] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  linalg.fill(%18, %cst) {__internal_linalg_transform__ = "vectorize", launch_info_key = "__op_num_0__"} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
  // Conv: the thread ids are queried a second time (%19-%21) rather than
  // reusing %13-%15.
  %19 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %20 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %21 = "gpu.thread_id"() {dimension = "z"} : () -> index
  // Input slice: origin (tid_z * 8, tid_y * 2) inside the workgroup tile,
  // extents min(9, %7 - 8 * tid_z) and min(3, %9 - 2 * tid_y).
  %22 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%21]
  %23 = affine.min affine_map<()[s0, s1] -> (9, s0 * -8 + s1)>()[%21, %7]
  %24 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%20]
  %25 = affine.min affine_map<()[s0, s1] -> (3, s0 * -2 + s1)>()[%20, %9]
  %26 = memref.subview %10[0, %22, %24, 0] [1, %23, %25, 3] [1, 1, 1, 1] : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  // Filter slice: 4 output channels starting at tid_x * 4.
  %27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
  %28 = memref.subview %11[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  // Output slice for the conv: same offsets and shape as the fill slice %18.
  %29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
  %30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
  %31 = memref.subview %12[0, %29, %20, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "tile_conv_filter", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, launch_info_key = "__op_num_0__", strides = dense<2> : tensor<2xi64>} ins(%26, %28 : memref<1x?x?x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%31 : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
  return
}

--- After tiling convolution filter  ---
// Dispatch function for the convolution, shown with the conv op tiled over the
// two leading filter dimensions. The entry point ABI fixes the workgroup local
// size to [4, 4, 1].
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  // Buffers: %0 and %1 feed the conv's `ins` (input and filter), %2 backs its `outs`.
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Workgroup-level tile offsets: output uses (z * 4, y * 4, x * 16), input uses
  // (z * 8, y * 8), and the filter is offset by x * 16 in its last dimension.
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
  %7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  // Workgroup tiles: 1x9x9x3 of the input, 3x3x3x16 of the filter, 1x4x4x16 of the output.
  %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Per-thread 1x4x1x4 output tile at (tid_z * 4, tid_y, tid_x * 4), filled with %cst (0.0).
  %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
  %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  linalg.fill(%16, %cst) {__internal_linalg_transform__ = "vectorize", launch_info_key = "__op_num_0__"} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, f32
  // The thread ids are queried a second time here (same ops as %11-%13); the
  // conv's per-thread subviews below are built from this second set.
  %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %18 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
  // Per-thread input tile: 1x9x3x3 at (tid_z * 8, tid_y * 2).
  %20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%19]
  %21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
  %22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  // Per-thread filter tile: 3x3x3x4 at last-dimension offset tid_x * 4.
  %23 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
  %24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  // Per-thread output tile for the conv; same offsets and sizes as %16, rebuilt from %17-%19.
  %25 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
  %26 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
  %27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Loops over the two leading filter dimensions (0..3 each). %arg0/%arg1 select
  // a 1x1x3x4 filter slice and shift the 1x7x1x3 input window by the same amounts.
  scf.for %arg0 = %c0 to %c3 step %c1 {
    scf.for %arg1 = %c0 to %c3 step %c1 {
      %28 = memref.subview %22[0, %arg0, %arg1, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %29 = memref.subview %24[%arg0, %arg1, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      // Stride-2, dilation-1 conv on the slices, with %27 as the `outs` operand;
      // the op is marked "vectorize" for the next transformation.
      linalg.conv_2d_input_nhwc_filter_hwcf {__internal_linalg_transform__ = "vectorize", dilations = dense<1> : tensor<2xi64>, iree.codegen.original_input_types = [memref<1x225x225x3xf32>, memref<3x3x3x32xf32>], iree.codegen.original_output_types = [memref<1x112x112x32xf32>], is_root_op, launch_info_key = "__op_num_0__", strides = dense<2> : tensor<2xi64>} ins(%28, %29 : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>) outs(%27 : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>)
    }
  }
  return
}

--- After Vectorization ---
// Dispatch function for the convolution in vectorized form: the per-thread
// work is expressed with vector.transfer_read / vector.contract /
// vector.transfer_write on the per-thread subviews. Workgroup local size is
// [4, 4, 1].
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  %cst = constant 0.000000e+00 : f32
  %cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c0 = constant 0 : index
  // Buffers: %0 and %1 are read (input and filter tiles derive from them); %2 is written.
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Workgroup-level tile offsets: output uses (z * 4, y * 4, x * 16), input uses
  // (z * 8, y * 8), and the filter is offset by x * 16 in its last dimension.
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
  %7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
  %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Zero-initialize this thread's 1x4x1x4 output tile with one vector write.
  vector.transfer_write %cst_0, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x4x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Thread ids are queried again (same ops as %11-%13) for the subviews below.
  %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %18 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
  // Per-thread tiles: input 1x9x3x3 (%22), filter 3x3x3x4 (%24), output 1x4x1x4 (%27).
  %20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%19]
  %21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
  %22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %23 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
  %24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %25 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
  %26 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
  %27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Loops over the two leading filter dimensions (0..3 each); %arg0/%arg1 pick
  // the 1x1x3x4 filter slice and shift the 1x7x1x3 input window.
  scf.for %arg0 = %c0 to %c3 step %c1 {
    scf.for %arg1 = %c0 to %c3 step %c1 {
      %28 = memref.subview %22[0, %arg0, %arg1, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %29 = memref.subview %24[%arg0, %arg1, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      // Load the filter slice as a 3x4 vector and split it into three 1x4 rows
      // (%31, %32, %33), one per index of the size-3 dimension.
      %30 = vector.transfer_read %29[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<3x4xf32>
      %31 = vector.extract_strided_slice %30 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
      %32 = vector.extract_strided_slice %30 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
      %33 = vector.extract_strided_slice %30 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<3x4xf32> to vector<1x4xf32>
      // Output row 0 (input row 0): read the 1x3 input and the 1x4 accumulator,
      // chain three contracts (1x1 input element times one 1x4 filter row, added
      // to the running accumulator), then write the result back to %27.
      %34 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %35 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %36 = vector.extract_strided_slice %34 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %37 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %36, %31, %35 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %38 = vector.extract_strided_slice %34 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %39 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %38, %32, %37 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %40 = vector.extract_strided_slice %34 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %33, %39 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      vector.transfer_write %41, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Output row 1 (input row 2): same read / contract x3 / write sequence.
      %42 = vector.transfer_read %28[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %43 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %44 = vector.extract_strided_slice %42 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %31, %43 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %46 = vector.extract_strided_slice %42 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %32, %45 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %48 = vector.extract_strided_slice %42 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %33, %47 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      vector.transfer_write %49, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Output row 2 (input row 4).
      %50 = vector.transfer_read %28[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %51 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %52 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %53 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %52, %31, %51 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %54 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %55 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %54, %32, %53 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %56 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %56, %33, %55 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      vector.transfer_write %57, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Output row 3 (input row 6).
      %58 = vector.transfer_read %28[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %59 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %60 = vector.extract_strided_slice %58 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %31, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %62 = vector.extract_strided_slice %58 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %32, %61 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %64 = vector.extract_strided_slice %58 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %65 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %64, %33, %63 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      vector.transfer_write %65, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    }
  }
  return
}

--- After Vector Unroll ---
// Dispatch function for the convolution with vector ops unrolled: every
// transfer in the function moves at most four f32 elements (vector<1x1x1x4xf32>,
// vector<1x4xf32> or vector<1x3xf32>). Workgroup local size is [4, 4, 1].
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  %cst = constant 0.000000e+00 : f32
  %cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c0 = constant 0 : index
  // Buffers: %0 and %1 are read (input and filter tiles derive from them); %2 is written.
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Workgroup-level tile offsets: output uses (z * 4, y * 4, x * 16), input uses
  // (z * 8, y * 8), and the filter is offset by x * 16 in its last dimension.
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
  %7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
  %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Zero-initialize the per-thread output tile: the 1x4x1x4 zero constant is
  // sliced into four 1x1x1x4 pieces, each written to one index (0..3) of dim 1.
  %17 = vector.extract_strided_slice %cst_0 {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %18 = vector.extract_strided_slice %cst_0 {offsets = [0, 1, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %19 = vector.extract_strided_slice %cst_0 {offsets = [0, 2, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %20 = vector.extract_strided_slice %cst_0 {offsets = [0, 3, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  vector.transfer_write %17, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %18, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %19, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %20, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Thread ids are queried again (same ops as %11-%13) for the subviews below.
  %21 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %22 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %23 = "gpu.thread_id"() {dimension = "z"} : () -> index
  // Per-thread tiles: input 1x9x3x3 (%26), filter 3x3x3x4 (%28), output 1x4x1x4 (%31).
  %24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%23]
  %25 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%22]
  %26 = memref.subview %8[0, %24, %25, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
  %28 = memref.subview %9[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%23]
  %30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
  %31 = memref.subview %10[0, %29, %22, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Loops over the two leading filter dimensions (0..3 each); %arg0/%arg1 pick
  // the 1x1x3x4 filter slice and shift the 1x7x1x3 input window.
  scf.for %arg0 = %c0 to %c3 step %c1 {
    scf.for %arg1 = %c0 to %c3 step %c1 {
      %32 = memref.subview %26[0, %arg0, %arg1, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %33 = memref.subview %28[%arg0, %arg1, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      // Filter slice loaded as three separate 1x4 rows, at dim-2 indices 0, 1, 2.
      %34 = vector.transfer_read %33[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      %35 = vector.transfer_read %33[%c0, %c0, %c1, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      %36 = vector.transfer_read %33[%c0, %c0, %c2, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      // Output row 0 (input row 0): read the 1x3 input and the 1x4 accumulator,
      // chain three contracts (1x1 input element times one 1x4 filter row, added
      // to the running accumulator), then write the result back to %31.
      %37 = vector.transfer_read %32[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %38 = vector.transfer_read %31[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %39 = vector.extract_strided_slice %37 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %39, %34, %38 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %41 = vector.extract_strided_slice %37 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %42 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %41, %35, %40 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %43 = vector.extract_strided_slice %37 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %44 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %43, %36, %42 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      vector.transfer_write %44, %31[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Output row 1 (input row 2): same read / contract x3 / write sequence.
      %45 = vector.transfer_read %32[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %46 = vector.transfer_read %31[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %47 = vector.extract_strided_slice %45 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %34, %46 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %49 = vector.extract_strided_slice %45 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %35, %48 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %51 = vector.extract_strided_slice %45 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %36, %50 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      vector.transfer_write %52, %31[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Output row 2 (input row 4).
      %53 = vector.transfer_read %32[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %54 = vector.transfer_read %31[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %55 = vector.extract_strided_slice %53 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %34, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %57 = vector.extract_strided_slice %53 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %58 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %57, %35, %56 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %59 = vector.extract_strided_slice %53 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %60 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %36, %58 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      vector.transfer_write %60, %31[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Output row 3 (input row 6).
      %61 = vector.transfer_read %32[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %62 = vector.transfer_read %31[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %63 = vector.extract_strided_slice %61 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %64 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %63, %34, %62 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %65 = vector.extract_strided_slice %61 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %35, %64 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %67 = vector.extract_strided_slice %61 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %67, %36, %66 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      vector.transfer_write %68, %31[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    }
  }
  return
}

--- After Hoisting ---
// Dispatch function for the convolution, dumped after the hoisting step.
// It reads bindings @legacy_io::@ro0 (1x225x225x3) and @legacy_io::@ro1
// (3x3x3x32) and writes @legacy_io::@wo2 (1x112x112x32).
// In this form the four vector<1x4xf32> accumulators are read from the
// per-thread output tile before the loop nest, carried through both scf.for
// loops as iter_args, and written back once after the loops.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  // %cst is the padding operand of every vector.transfer_read below.
  %cst = constant 0.000000e+00 : f32
  // %cst_0 is the zero tile used to initialize the per-thread output tile.
  %cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c0 = constant 0 : index
  // Buffer bindings ro0, ro1 and wo2 of the @legacy_io interface.
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Per-workgroup offsets: %3/%4 (id * 4) index dims 1/2 of %2, %5 (id * 16)
  // indexes dim 3 of both %1 and %2, and %6/%7 (id * 8) index dims 1/2 of %0.
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
  %7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  // Workgroup-level tiles: 1x9x9x3 of %0, 3x3x3x16 of %1 and 1x4x4x16 of %2.
  %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Per-thread 1x4x1x4 tile of the workgroup output tile %10.
  %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
  %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Zero-fill the per-thread output tile: one 1x1x1x4 slice of %cst_0 is
  // written for each index 0..3 along dim 1 of %16.
  %17 = vector.extract_strided_slice %cst_0 {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %18 = vector.extract_strided_slice %cst_0 {offsets = [0, 1, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %19 = vector.extract_strided_slice %cst_0 {offsets = [0, 2, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %20 = vector.extract_strided_slice %cst_0 {offsets = [0, 3, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  vector.transfer_write %17, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %18, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %19, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %20, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // The thread ids are queried a second time. %26, %28 and %31 are the
  // per-thread views of the three workgroup tiles %8, %9 and %10.
  // %31 is taken from %10 with the same thread-id-derived offsets and sizes
  // as %16, so the reads below observe the zero-fill written through %16.
  %21 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %22 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %23 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%23]
  %25 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%22]
  %26 = memref.subview %8[0, %24, %25, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
  %28 = memref.subview %9[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%23]
  %30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
  %31 = memref.subview %10[0, %29, %22, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Load the four accumulator vectors (dim-1 indices 0..3 of %31) ahead of
  // the loop nest; they become the initial iter_args of the outer loop.
  %32 = vector.transfer_read %31[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %33 = vector.transfer_read %31[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %34 = vector.transfer_read %31[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %35 = vector.transfer_read %31[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  // %arg0 and %arg5 each iterate over 0, 1, 2 and select dims 0 and 1 of the
  // 3x3x3x4 tile %28; the four accumulators travel through both loops as
  // iter_args.
  %36:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %32, %arg2 = %33, %arg3 = %34, %arg4 = %35) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
    %37:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      // 1x7x1x3 window of %26 and 1x1x3x4 slice of %28 for this
      // (%arg0, %arg5) position.
      %38 = memref.subview %26[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %39 = memref.subview %28[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      // Three vector<1x4xf32> rows of %39, one per index 0..2 along dim 2.
      // They are shared by all four accumulator chains below.
      %40 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      %41 = vector.transfer_read %39[%c0, %c0, %c1, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      %42 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      // Chain 0: dim-1 index 0 of %38. Each of its 3 elements is contracted
      // with the matching row (%40, %41, %42), accumulating onto %arg6.
      %43 = vector.transfer_read %38[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %44 = vector.extract_strided_slice %43 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %40, %arg6 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %46 = vector.extract_strided_slice %43 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %41, %45 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %48 = vector.extract_strided_slice %43 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %42, %47 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      // Chain 1: dim-1 index 2 of %38, accumulating onto %arg7.
      %50 = vector.transfer_read %38[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %51 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %40, %arg7 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %53 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %53, %41, %52 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %55 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %42, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      // Chain 2: dim-1 index 4 of %38, accumulating onto %arg8.
      %57 = vector.transfer_read %38[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %58 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %40, %arg8 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %60 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %41, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %62 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %42, %61 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      // Chain 3: dim-1 index 6 of %38, accumulating onto %arg9.
      %64 = vector.transfer_read %38[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %65 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %40, %arg9 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %67 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %67, %41, %66 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %69 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %70 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %69, %42, %68 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      // Hand the updated accumulators to the next inner iteration.
      scf.yield %49, %56, %63, %70 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    scf.yield %37#0, %37#1, %37#2, %37#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
  }
  // Store the final accumulators back to dim-1 indices 3, 2, 1, 0 of %31.
  vector.transfer_write %36#3, %31[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %36#2, %31[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %36#1, %31[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %36#0, %31[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  return
}

--- After generalization ---
// Dispatch function for the convolution, dumped after the generalization
// step. It reads bindings @legacy_io::@ro0 (1x225x225x3) and
// @legacy_io::@ro1 (3x3x3x32) and writes @legacy_io::@wo2 (1x112x112x32).
// The four vector<1x4xf32> accumulators are read from the per-thread output
// tile before the loop nest, carried through both scf.for loops as
// iter_args, and written back once after the loops.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  // %cst is the padding operand of every vector.transfer_read below.
  %cst = constant 0.000000e+00 : f32
  // %cst_0 is the zero tile used to initialize the per-thread output tile.
  %cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c0 = constant 0 : index
  // Buffer bindings ro0, ro1 and wo2 of the @legacy_io interface.
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Per-workgroup offsets: %3/%4 (id * 4) index dims 1/2 of %2, %5 (id * 16)
  // indexes dim 3 of both %1 and %2, and %6/%7 (id * 8) index dims 1/2 of %0.
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
  %7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  // Workgroup-level tiles: 1x9x9x3 of %0, 3x3x3x16 of %1 and 1x4x4x16 of %2.
  %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Per-thread 1x4x1x4 tile of the workgroup output tile %10.
  %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
  %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Zero-fill the per-thread output tile: one 1x1x1x4 slice of %cst_0 is
  // written for each index 0..3 along dim 1 of %16.
  %17 = vector.extract_strided_slice %cst_0 {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %18 = vector.extract_strided_slice %cst_0 {offsets = [0, 1, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %19 = vector.extract_strided_slice %cst_0 {offsets = [0, 2, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  %20 = vector.extract_strided_slice %cst_0 {offsets = [0, 3, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
  vector.transfer_write %17, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %18, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %19, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %20, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // The thread ids are queried a second time. %26, %28 and %31 are the
  // per-thread views of the three workgroup tiles %8, %9 and %10.
  // %31 is taken from %10 with the same thread-id-derived offsets and sizes
  // as %16, so the reads below observe the zero-fill written through %16.
  %21 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %22 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %23 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%23]
  %25 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%22]
  %26 = memref.subview %8[0, %24, %25, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
  %28 = memref.subview %9[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%23]
  %30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
  %31 = memref.subview %10[0, %29, %22, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Load the four accumulator vectors (dim-1 indices 0..3 of %31) ahead of
  // the loop nest; they become the initial iter_args of the outer loop.
  %32 = vector.transfer_read %31[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %33 = vector.transfer_read %31[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %34 = vector.transfer_read %31[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %35 = vector.transfer_read %31[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  // %arg0 and %arg5 each iterate over 0, 1, 2 and select dims 0 and 1 of the
  // 3x3x3x4 tile %28; the four accumulators travel through both loops as
  // iter_args.
  %36:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %32, %arg2 = %33, %arg3 = %34, %arg4 = %35) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
    %37:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      // 1x7x1x3 window of %26 and 1x1x3x4 slice of %28 for this
      // (%arg0, %arg5) position.
      %38 = memref.subview %26[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %39 = memref.subview %28[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      // Three vector<1x4xf32> rows of %39, one per index 0..2 along dim 2.
      // They are shared by all four accumulator chains below.
      %40 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      %41 = vector.transfer_read %39[%c0, %c0, %c1, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      %42 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      // Chain 0: dim-1 index 0 of %38. Each of its 3 elements is contracted
      // with the matching row (%40, %41, %42), accumulating onto %arg6.
      %43 = vector.transfer_read %38[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %44 = vector.extract_strided_slice %43 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %40, %arg6 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %46 = vector.extract_strided_slice %43 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %41, %45 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %48 = vector.extract_strided_slice %43 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %42, %47 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      // Chain 1: dim-1 index 2 of %38, accumulating onto %arg7.
      %50 = vector.transfer_read %38[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %51 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %40, %arg7 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %53 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %53, %41, %52 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %55 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %42, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      // Chain 2: dim-1 index 4 of %38, accumulating onto %arg8.
      %57 = vector.transfer_read %38[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %58 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %40, %arg8 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %60 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %41, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %62 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %42, %61 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      // Chain 3: dim-1 index 6 of %38, accumulating onto %arg9.
      %64 = vector.transfer_read %38[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %65 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %40, %arg9 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %67 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %67, %41, %66 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      %69 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
      %70 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %69, %42, %68 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
      // Hand the updated accumulators to the next inner iteration.
      scf.yield %49, %56, %63, %70 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    scf.yield %37#0, %37#1, %37#2, %37#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
  }
  // Store the final accumulators back to dim-1 indices 3, 2, 1, 0 of %31.
  vector.transfer_write %36#3, %31[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %36#2, %31[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %36#1, %31[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %36#0, %31[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  return
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::TileAndVectorizeInOneWorkgroupPass ***
// Executable variant @vulkan_spirv (filter "vulkan*"): contains the dispatch
// entry point declaration, the tiled + vectorized kernel body, and the
// @legacy_io buffer interface the kernel binds against.
hal.executable.target @vulkan_spirv, filter="vulkan*" {
  // Entry point region: takes three index arguments and returns the constants
  // (2, 28, 28). NOTE(review): presumably the workgroup count along x/y/z; this
  // is consistent with the kernel's per-workgroup tile (16 of 32 channels along
  // x, 4 of 112 rows/columns along z and y) -- confirm against the HAL docs.
  hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
  ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
    %c2 = constant 2 : index
    %c28 = constant 28 : index
    %c28_0 = constant 28 : index
    hal.return %c2, %c28, %c28_0 : index, index, index
  }
  module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
    // Kernel body. local_size is [4, 4, 1]. Each invocation zero-fills a
    // 1x4x1x4 slice of the output, then accumulates into it over a 3x3 loop nest
    // using 1x4 vector accumulators carried through scf.for iter_args.
    func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
      %c4 = constant 4 : index
      %c2 = constant 2 : index
      %c6 = constant 6 : index
      %cst = constant 0.000000e+00 : f32
      %cst_0 = constant dense<0.000000e+00> : vector<1x4x1x4xf32>
      %c1 = constant 1 : index
      %c3 = constant 3 : index
      %c0 = constant 0 : index
      // Buffer bindings: two read-only inputs (ro0, ro1) and one output (wo2).
      %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
      %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
      %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %workgroup_id_z = hal.interface.workgroup.id[2] : index
      // Workgroup-level offsets: output dims 1/2 advance by 4 per workgroup id
      // (z/y), the last (channel) dim by 16 per id x, and the ro0 view by 8 per
      // id (z/y).
      %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
      %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
      %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
      %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
      %7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
      // Workgroup tiles: 1x9x9x3 view of ro0, 3x3x3x16 view of ro1, and a
      // 1x4x4x16 view of wo2.
      %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Per-invocation output slice (1x4x1x4) at [0, tid_z * 4, tid_y, tid_x * 4]
      // inside the workgroup output tile.
      %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
      %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
      %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
      %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
      %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Zero-fill: the 1x4x1x4 zero constant is split into four 1x1x1x4 slices
      // (offsets 0..3 along dim 1), each written to the matching row of %16.
      %17 = vector.extract_strided_slice %cst_0 {offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
      %18 = vector.extract_strided_slice %cst_0 {offsets = [0, 1, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
      %19 = vector.extract_strided_slice %cst_0 {offsets = [0, 2, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
      %20 = vector.extract_strided_slice %cst_0 {offsets = [0, 3, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1]} : vector<1x4x1x4xf32> to vector<1x1x1x4xf32>
      vector.transfer_write %17, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %18, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %19, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %20, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Thread ids are queried a second time for the accumulation phase; %21/%22/%23
      // duplicate %11/%12/%13.
      %21 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %22 = "gpu.thread_id"() {dimension = "y"} : () -> index
      %23 = "gpu.thread_id"() {dimension = "z"} : () -> index
      // Per-invocation views: 1x9x3x3 of the ro0 tile at [0, tid_z * 8, tid_y * 2, 0]
      // and 3x3x3x4 of the ro1 tile at last-dim offset tid_x * 4.
      %24 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%23]
      %25 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%22]
      %26 = memref.subview %8[0, %24, %25, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %27 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
      %28 = memref.subview %9[0, 0, 0, %27] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      // %31 is built from %10 with the same offset expressions and sizes as %16,
      // so it addresses the slice that was just zero-filled.
      %29 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%23]
      %30 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%21]
      %31 = memref.subview %10[0, %29, %22, %30] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      // Initial accumulators: four 1x4 vectors read back from rows 0..3 (dim 1)
      // of %31.
      %32 = vector.transfer_read %31[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %33 = vector.transfer_read %31[%c0, %c1, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %34 = vector.transfer_read %31[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      %35 = vector.transfer_read %31[%c0, %c3, %c0, %c0], %cst {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
      // 3x3 loop nest: %arg0 and %arg5 both run 0..2 and index dims 0 and 1 of
      // the ro1 view; the four accumulators are threaded through iter_args.
      %36:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %32, %arg2 = %33, %arg3 = %34, %arg4 = %35) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %37:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
          // Views for this step: a 1x7x1x3 window of %26 starting at
          // [0, %arg0, %arg5, 0] and the 1x1x3x4 slice of %28 at [%arg0, %arg5, 0, 0].
          %38 = memref.subview %26[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
          %39 = memref.subview %28[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
          // Three 1x4 rows of %39, at indices 0, 1, 2 along its dim 2. They are
          // reused by all four accumulator chains below.
          %40 = vector.transfer_read %39[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %41 = vector.transfer_read %39[%c0, %c0, %c1, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          %42 = vector.transfer_read %39[%c0, %c0, %c2, %c0], %cst {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
          // Chain 0 (accumulator %arg6): read a 1x3 vector at dim-1 index 0 of %38,
          // split it into three 1x1 pieces, and chain three contractions
          // (1x1 times 1x4, added into the 1x4 accumulator).
          %43 = vector.transfer_read %38[%c0, %c0, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %44 = vector.extract_strided_slice %43 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %40, %arg6 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          %46 = vector.extract_strided_slice %43 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %47 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %46, %41, %45 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          %48 = vector.extract_strided_slice %43 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %49 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %48, %42, %47 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          // Chain 1 (accumulator %arg7): same pattern with the read at dim-1 index 2.
          %50 = vector.transfer_read %38[%c0, %c2, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %51 = vector.extract_strided_slice %50 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %40, %arg7 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          %53 = vector.extract_strided_slice %50 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %53, %41, %52 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          %55 = vector.extract_strided_slice %50 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %56 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %55, %42, %54 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          // Chain 2 (accumulator %arg8): read at dim-1 index 4.
          %57 = vector.transfer_read %38[%c0, %c4, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %58 = vector.extract_strided_slice %57 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %40, %arg8 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          %60 = vector.extract_strided_slice %57 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %61 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %60, %41, %59 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          %62 = vector.extract_strided_slice %57 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %63 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %62, %42, %61 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          // Chain 3 (accumulator %arg9): read at dim-1 index 6.
          %64 = vector.transfer_read %38[%c0, %c6, %c0, %c0], %cst {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
          %65 = vector.extract_strided_slice %64 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %40, %arg9 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          %67 = vector.extract_strided_slice %64 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %68 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %67, %41, %66 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          %69 = vector.extract_strided_slice %64 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
          %70 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %69, %42, %68 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
          // Yield the final value of each of the four chains as the next accumulators.
          scf.yield %49, %56, %63, %70 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
        }
        scf.yield %37#0, %37#1, %37#2, %37#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      // Store the four accumulated 1x4 rows back to %31 (emitted in order 3, 2, 1, 0).
      vector.transfer_write %36#3, %31[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %36#2, %31[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %36#1, %31[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      vector.transfer_write %36#0, %31[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
      return
    }
    // Buffer interface: three storage buffers in descriptor set 0, bindings 0..2
    // (two "Read", one "Write|Discard").
    hal.interface @legacy_io attributes {sym_visibility = "private"} {
      hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
  }
}

// *** IR Dump After Canonicalizer ***
// SPIR-V target module holding the @conv_dispatch_0 kernel and its @legacy_io
// buffer interface. The spv.target_env attribute lists the SPIR-V version,
// capabilities, extensions and resource limits the kernel is compiled against.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Kernel body. local_size is [4, 4, 1]. Each invocation zero-fills a 1x4x1x4
  // slice of the output, then accumulates into it over a 3x3 loop nest using
  // 1x4 vector accumulators carried through scf.for iter_args.
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    // %cst is a 1x1x1x4 zero vector used for the zero-fill; %cst_0 is the scalar
    // padding value passed to every vector.transfer_read.
    %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    // Buffer bindings: two read-only inputs (ro0, ro1) and one output (wo2).
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Workgroup-level offsets: output dims 1/2 advance by 4 per workgroup id
    // (z/y), the last (channel) dim by 16 per id x, and the ro0 view by 8 per id
    // (z/y).
    %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
    %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
    %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
    %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
    %7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
    // Workgroup tiles: 1x9x9x3 view of ro0, 3x3x3x16 view of ro1, and a 1x4x4x16
    // view of wo2.
    %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // Per-invocation output slice (1x4x1x4) at [0, tid_z * 4, tid_y, tid_x * 4]
    // inside the workgroup output tile.
    %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
    %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
    %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // Zero-fill: the same 1x1x1x4 zero vector is written to rows 0..3 (dim 1) of %16.
    vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // Thread ids are queried a second time for the accumulation phase; %17/%18/%19
    // duplicate %11/%12/%13.
    %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %18 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
    // Per-invocation views: 1x9x3x3 of the ro0 tile at [0, tid_z * 8, tid_y * 2, 0]
    // and 3x3x3x4 of the ro1 tile at last-dim offset tid_x * 4.
    %20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%19]
    %21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
    %22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %23 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
    %24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    // %27 is built from %10 with the same offset expressions and sizes as %16,
    // so it addresses the slice that was just zero-filled.
    %25 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
    %26 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
    %27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // Initial accumulators: four 1x4 vectors read back from rows 0..3 (dim 1) of %27.
    %28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %29 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %30 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %31 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    // 3x3 loop nest: %arg0 and %arg5 both run 0..2 and index dims 0 and 1 of the
    // ro1 view; the four accumulators are threaded through iter_args.
    %32:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %28, %arg2 = %29, %arg3 = %30, %arg4 = %31) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        // Views for this step: a 1x7x1x3 window of %22 starting at
        // [0, %arg0, %arg5, 0] and the 1x1x3x4 slice of %24 at [%arg0, %arg5, 0, 0].
        %34 = memref.subview %22[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %35 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        // Three 1x4 rows of %35, at indices 0, 1, 2 along its dim 2. They are
        // reused by all four accumulator chains below.
        %36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        // Chain 0 (accumulator %arg6): read a 1x3 vector at dim-1 index 0 of %34,
        // split it into three 1x1 pieces, and chain three contractions
        // (1x1 times 1x4, added into the 1x4 accumulator).
        %39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %40 = vector.extract_strided_slice %39 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %40, %36, %arg6 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        %42 = vector.extract_strided_slice %39 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %43 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %42, %37, %41 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        %44 = vector.extract_strided_slice %39 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %45 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %44, %38, %43 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        // Chain 1 (accumulator %arg7): same pattern with the read at dim-1 index 2.
        %46 = vector.transfer_read %34[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %47 = vector.extract_strided_slice %46 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %48 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %47, %36, %arg7 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        %49 = vector.extract_strided_slice %46 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %50 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %37, %48 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        %51 = vector.extract_strided_slice %46 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %52 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %51, %38, %50 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        // Chain 2 (accumulator %arg8): read at dim-1 index 4.
        %53 = vector.transfer_read %34[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %54 = vector.extract_strided_slice %53 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %55 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %54, %36, %arg8 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        %56 = vector.extract_strided_slice %53 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %57 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %56, %37, %55 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        %58 = vector.extract_strided_slice %53 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %59 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %58, %38, %57 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        // Chain 3 (accumulator %arg9): read at dim-1 index 6.
        %60 = vector.transfer_read %34[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %61 = vector.extract_strided_slice %60 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %62 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %61, %36, %arg9 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        %63 = vector.extract_strided_slice %60 {offsets = [0, 1], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %64 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %63, %37, %62 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        %65 = vector.extract_strided_slice %60 {offsets = [0, 2], sizes = [1, 1], strides = [1, 1]} : vector<1x3xf32> to vector<1x1xf32>
        %66 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %65, %38, %64 : vector<1x1xf32>, vector<1x4xf32> into vector<1x4xf32>
        // Yield the final value of each of the four chains as the next accumulators.
        scf.yield %45, %52, %59, %66 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    // Store the four accumulated 1x4 rows back to %27 (emitted in order 3, 2, 1, 0).
    vector.transfer_write %32#3, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#2, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#1, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#0, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    return
  }
  // Buffer interface: three storage buffers in descriptor set 0, bindings 0..2
  // (two "Read", one "Write|Discard").
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ConvertVectorToGPUPass ***
// Convolution dispatch body as dumped after ConvertVectorToGPUPass.
// Launched with a 4x4x1 workgroup (local_size below). Each invocation
// zero-fills, accumulates into and writes back a 1x4x1x4 tile of the
// 1x112x112x32 output buffer. At this stage the multiply-accumulate work is
// expressed as scalar-extract + broadcast + mulf/addf on vector<4xf32>
// (the preceding dump in this file still shows vector.contract here).
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  // Zero splat used to initialise the output tile, padding value %cst_0 for
  // transfer_read, and the index constants used for offsets and loop bounds.
  %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  %cst_0 = constant 0.000000e+00 : f32
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c0 = constant 0 : index
  // Buffers: %0 = input (1x225x225x3), %1 = filter (3x3x3x32),
  // %2 = output (1x112x112x32).
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Workgroup-level tile offsets. Output offsets use z*4, y*4, x*16; the
  // matching input offsets use z*8 and y*8, i.e. twice the output offsets in
  // the two spatial dimensions.
  %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
  %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
  %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
  %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_z]
  %7 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
  // Workgroup tiles: %8 = 1x9x9x3 input window, %9 = 3x3x3x16 filter slice,
  // %10 = 1x4x4x16 output tile.
  %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // --- Fill phase: each invocation zeroes its own 1x4x1x4 slice of %10. ---
  %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %14 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%13]
  %15 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%11]
  %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // One 4-wide zero store per row (dim 1 index 0..3) of the slice.
  vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // --- Convolution phase. The thread ids are queried a second time here
  // (%17/%18/%19 repeat the %11/%12/%13 queries). ---
  %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %18 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
  // Per-invocation input window: offsets z*8 and y*2, size 1x9x3x3.
  %20 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%19]
  %21 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%18]
  %22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  // Per-invocation filter slice: 4 consecutive entries of the last filter
  // dimension starting at x*4.
  %23 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
  %24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  // Output slice %27 is built from the same thread-id expressions (z*4, y,
  // x*4) and sizes as %16 above.
  %25 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%19]
  %26 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%17]
  %27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Load the four output rows; they become the initial loop-carried
  // accumulators %arg1..%arg4.
  %28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %29 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %30 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  %31 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
  // Loop nest over the first two filter dimensions (%arg0, %arg5 in [0, 3)),
  // carrying the four accumulators through both loops.
  %32:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %28, %arg2 = %29, %arg3 = %30, %arg4 = %31) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
    %33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      // Input column of 7 rows and the 1x1x3x4 filter tap for this
      // (%arg0, %arg5) position.
      %34 = memref.subview %22[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %35 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      // Filter rows for index 0, 1, 2 of filter dim 2; each is 4 wide and is
      // reused by all four accumulators below.
      %36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      %37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      %38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
      // Accumulator 0 (%arg6): input row 0 of %34. For each of the three
      // input values: extract scalar, broadcast to 4 lanes, multiply by the
      // matching filter row, add to the running sum.
      %39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %40 = vector.extract %39[0, 0] : vector<1x3xf32>
      %41 = vector.broadcast %40 : f32 to vector<4xf32>
      %42 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
      %43 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
      %44 = mulf %41, %42 : vector<4xf32>
      %45 = addf %44, %43 : vector<4xf32>
      %46 = vector.extract %39[0, 1] : vector<1x3xf32>
      %47 = vector.broadcast %46 : f32 to vector<4xf32>
      %48 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
      %49 = mulf %47, %48 : vector<4xf32>
      %50 = addf %49, %45 : vector<4xf32>
      %51 = vector.extract %39[0, 2] : vector<1x3xf32>
      %52 = vector.broadcast %51 : f32 to vector<4xf32>
      %53 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
      %54 = mulf %52, %53 : vector<4xf32>
      %55 = addf %54, %50 : vector<4xf32>
      %56 = vector.shape_cast %55 : vector<4xf32> to vector<1x4xf32>
      // Accumulator 1 (%arg7): input row 2 of %34, same multiply-add chain.
      %57 = vector.transfer_read %34[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %58 = vector.extract %57[0, 0] : vector<1x3xf32>
      %59 = vector.broadcast %58 : f32 to vector<4xf32>
      %60 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
      %61 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
      %62 = mulf %59, %60 : vector<4xf32>
      %63 = addf %62, %61 : vector<4xf32>
      %64 = vector.extract %57[0, 1] : vector<1x3xf32>
      %65 = vector.broadcast %64 : f32 to vector<4xf32>
      %66 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
      %67 = mulf %65, %66 : vector<4xf32>
      %68 = addf %67, %63 : vector<4xf32>
      %69 = vector.extract %57[0, 2] : vector<1x3xf32>
      %70 = vector.broadcast %69 : f32 to vector<4xf32>
      %71 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
      %72 = mulf %70, %71 : vector<4xf32>
      %73 = addf %72, %68 : vector<4xf32>
      %74 = vector.shape_cast %73 : vector<4xf32> to vector<1x4xf32>
      // Accumulator 2 (%arg8): input row 4 of %34.
      %75 = vector.transfer_read %34[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %76 = vector.extract %75[0, 0] : vector<1x3xf32>
      %77 = vector.broadcast %76 : f32 to vector<4xf32>
      %78 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
      %79 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
      %80 = mulf %77, %78 : vector<4xf32>
      %81 = addf %80, %79 : vector<4xf32>
      %82 = vector.extract %75[0, 1] : vector<1x3xf32>
      %83 = vector.broadcast %82 : f32 to vector<4xf32>
      %84 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
      %85 = mulf %83, %84 : vector<4xf32>
      %86 = addf %85, %81 : vector<4xf32>
      %87 = vector.extract %75[0, 2] : vector<1x3xf32>
      %88 = vector.broadcast %87 : f32 to vector<4xf32>
      %89 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
      %90 = mulf %88, %89 : vector<4xf32>
      %91 = addf %90, %86 : vector<4xf32>
      %92 = vector.shape_cast %91 : vector<4xf32> to vector<1x4xf32>
      // Accumulator 3 (%arg9): input row 6 of %34.
      %93 = vector.transfer_read %34[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
      %94 = vector.extract %93[0, 0] : vector<1x3xf32>
      %95 = vector.broadcast %94 : f32 to vector<4xf32>
      %96 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
      %97 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
      %98 = mulf %95, %96 : vector<4xf32>
      %99 = addf %98, %97 : vector<4xf32>
      %100 = vector.extract %93[0, 1] : vector<1x3xf32>
      %101 = vector.broadcast %100 : f32 to vector<4xf32>
      %102 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
      %103 = mulf %101, %102 : vector<4xf32>
      %104 = addf %103, %99 : vector<4xf32>
      %105 = vector.extract %93[0, 2] : vector<1x3xf32>
      %106 = vector.broadcast %105 : f32 to vector<4xf32>
      %107 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
      %108 = mulf %106, %107 : vector<4xf32>
      %109 = addf %108, %104 : vector<4xf32>
      %110 = vector.shape_cast %109 : vector<4xf32> to vector<1x4xf32>
      // Hand the updated accumulators to the next inner iteration.
      scf.yield %56, %74, %92, %110 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
  }
  // Store the final accumulators back to rows 3..0 of the output slice %27.
  vector.transfer_write %32#3, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %32#2, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %32#1, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  vector.transfer_write %32#0, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  return
}

// *** IR Dump After ConvertAffineToStandard ***
// Module as dumped after ConvertAffineToStandard. The target environment
// attribute declares SPIR-V v1.3 on ARM:IntegratedGPU with subgroup_size = 16.
// Relative to the affine form, every `affine.apply (s0 * K)` is now a
// `constant K : index` followed by `muli`; the constants are materialised
// per use (%c4_1, %c4_2, %c8_3, ...) and are not yet merged here.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Convolution kernel, 4x4x1 workgroup. Each invocation zero-fills,
  // accumulates into and writes back a 1x4x1x4 slice of the output buffer.
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    // Zero splat for the fill, transfer_read padding value %cst_0, and index
    // constants for offsets and loop bounds.
    %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    // Buffers: %0 = input (1x225x225x3), %1 = filter (3x3x3x32),
    // %2 = output (1x112x112x32).
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Workgroup tile offsets: output uses z*4, y*4, x*16; input uses z*8, y*8.
    %c4_1 = constant 4 : index
    %3 = muli %workgroup_id_z, %c4_1 : index
    %c4_2 = constant 4 : index
    %4 = muli %workgroup_id_y, %c4_2 : index
    %c16 = constant 16 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %c8 = constant 8 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %c8_3 = constant 8 : index
    %7 = muli %workgroup_id_y, %c8_3 : index
    // Workgroup tiles: %8 = 1x9x9x3 input window, %9 = 3x3x3x16 filter slice,
    // %10 = 1x4x4x16 output tile.
    %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // --- Fill phase: zero this invocation's 1x4x1x4 slice of %10. ---
    %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %c4_4 = constant 4 : index
    %14 = muli %13, %c4_4 : index
    %c4_5 = constant 4 : index
    %15 = muli %11, %c4_5 : index
    %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // One 4-wide zero store per row (dim 1 index 0..3) of the slice.
    vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // --- Convolution phase. Thread ids are queried again (%17/%18/%19
    // repeat the %11/%12/%13 queries). ---
    %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %18 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
    // Per-invocation input window: offsets z*8 and y*2, size 1x9x3x3.
    %c8_6 = constant 8 : index
    %20 = muli %19, %c8_6 : index
    %c2_7 = constant 2 : index
    %21 = muli %18, %c2_7 : index
    %22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    // Per-invocation filter slice: 4 entries of the last filter dimension
    // starting at x*4.
    %c4_8 = constant 4 : index
    %23 = muli %17, %c4_8 : index
    %24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    // Output slice %27 uses the same thread-id expressions (z*4, y, x*4) and
    // sizes as %16 above.
    %c4_9 = constant 4 : index
    %25 = muli %19, %c4_9 : index
    %c4_10 = constant 4 : index
    %26 = muli %17, %c4_10 : index
    %27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // Load the four output rows as the initial loop-carried accumulators.
    %28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %29 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %30 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %31 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    // Loop nest over the first two filter dimensions (%arg0, %arg5 in
    // [0, 3)), carrying the four accumulators through both loops.
    %32:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %28, %arg2 = %29, %arg3 = %30, %arg4 = %31) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        // Input column of 7 rows and the 1x1x3x4 filter tap for this
        // (%arg0, %arg5) position.
        %34 = memref.subview %22[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %35 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        // Filter rows for index 0, 1, 2 of filter dim 2; shared by all four
        // accumulators below.
        %36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        // Accumulator 0 (%arg6): input row 0 of %34. For each of the three
        // input values: extract, broadcast to 4 lanes, multiply by the
        // matching filter row, add to the running sum.
        %39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %40 = vector.extract %39[0, 0] : vector<1x3xf32>
        %41 = vector.broadcast %40 : f32 to vector<4xf32>
        %42 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %43 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
        %44 = mulf %41, %42 : vector<4xf32>
        %45 = addf %44, %43 : vector<4xf32>
        %46 = vector.extract %39[0, 1] : vector<1x3xf32>
        %47 = vector.broadcast %46 : f32 to vector<4xf32>
        %48 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
        %49 = mulf %47, %48 : vector<4xf32>
        %50 = addf %49, %45 : vector<4xf32>
        %51 = vector.extract %39[0, 2] : vector<1x3xf32>
        %52 = vector.broadcast %51 : f32 to vector<4xf32>
        %53 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
        %54 = mulf %52, %53 : vector<4xf32>
        %55 = addf %54, %50 : vector<4xf32>
        %56 = vector.shape_cast %55 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 1 (%arg7): input row 2 of %34.
        %57 = vector.transfer_read %34[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %58 = vector.extract %57[0, 0] : vector<1x3xf32>
        %59 = vector.broadcast %58 : f32 to vector<4xf32>
        %60 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %61 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %62 = mulf %59, %60 : vector<4xf32>
        %63 = addf %62, %61 : vector<4xf32>
        %64 = vector.extract %57[0, 1] : vector<1x3xf32>
        %65 = vector.broadcast %64 : f32 to vector<4xf32>
        %66 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
        %67 = mulf %65, %66 : vector<4xf32>
        %68 = addf %67, %63 : vector<4xf32>
        %69 = vector.extract %57[0, 2] : vector<1x3xf32>
        %70 = vector.broadcast %69 : f32 to vector<4xf32>
        %71 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
        %72 = mulf %70, %71 : vector<4xf32>
        %73 = addf %72, %68 : vector<4xf32>
        %74 = vector.shape_cast %73 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 2 (%arg8): input row 4 of %34.
        %75 = vector.transfer_read %34[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %76 = vector.extract %75[0, 0] : vector<1x3xf32>
        %77 = vector.broadcast %76 : f32 to vector<4xf32>
        %78 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %79 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %80 = mulf %77, %78 : vector<4xf32>
        %81 = addf %80, %79 : vector<4xf32>
        %82 = vector.extract %75[0, 1] : vector<1x3xf32>
        %83 = vector.broadcast %82 : f32 to vector<4xf32>
        %84 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
        %85 = mulf %83, %84 : vector<4xf32>
        %86 = addf %85, %81 : vector<4xf32>
        %87 = vector.extract %75[0, 2] : vector<1x3xf32>
        %88 = vector.broadcast %87 : f32 to vector<4xf32>
        %89 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
        %90 = mulf %88, %89 : vector<4xf32>
        %91 = addf %90, %86 : vector<4xf32>
        %92 = vector.shape_cast %91 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 3 (%arg9): input row 6 of %34.
        %93 = vector.transfer_read %34[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %94 = vector.extract %93[0, 0] : vector<1x3xf32>
        %95 = vector.broadcast %94 : f32 to vector<4xf32>
        %96 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %97 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %98 = mulf %95, %96 : vector<4xf32>
        %99 = addf %98, %97 : vector<4xf32>
        %100 = vector.extract %93[0, 1] : vector<1x3xf32>
        %101 = vector.broadcast %100 : f32 to vector<4xf32>
        %102 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
        %103 = mulf %101, %102 : vector<4xf32>
        %104 = addf %103, %99 : vector<4xf32>
        %105 = vector.extract %93[0, 2] : vector<1x3xf32>
        %106 = vector.broadcast %105 : f32 to vector<4xf32>
        %107 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
        %108 = mulf %106, %107 : vector<4xf32>
        %109 = addf %108, %104 : vector<4xf32>
        %110 = vector.shape_cast %109 : vector<4xf32> to vector<1x4xf32>
        // Hand the updated accumulators to the next inner iteration.
        scf.yield %56, %74, %92, %110 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    // Store the final accumulators back to rows 3..0 of the output slice %27.
    vector.transfer_write %32#3, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#2, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#1, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#0, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    return
  }
  // Buffer interface referenced by the binding.subspan ops above: two
  // read-only storage buffers (@ro0, @ro1) and one write/discard storage
  // buffer (@wo2), all in descriptor set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After Canonicalizer ***
// Snapshot of the dispatch module as printed after the Canonicalizer pass.
// It holds one entry function (@conv_dispatch_0) and the HAL interface that declares its three buffer bindings.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Entry point; the ABI attribute sets local_size to [4, 4, 1].
  // Each invocation zero-fills a 1x4x1x4 view of the output buffer, reads it back, accumulates
  // into the four rows over a 3x3 loop nest, and stores the four rows again.
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    %c16 = constant 16 : index
    %c8 = constant 8 : index
    // Buffer bindings: @ro0 (1x225x225x3) and @ro1 (3x3x3x32) are read, @wo2 (1x112x112x32) is written.
    // The shapes match the input, filter and result of the mhlo.convolution at the top of this file.
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Per-workgroup offsets: %3/%4/%5 (ids times 4, 4, 16) index the @wo2 view and, for %5, the @ro1 view;
    // %6/%7 (ids times 8) index the @ro0 view.
    %3 = muli %workgroup_id_z, %c4 : index
    %4 = muli %workgroup_id_y, %c4 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %7 = muli %workgroup_id_y, %c8 : index
    // Per-workgroup views: 1x9x9x3 of @ro0, 3x3x3x16 of @ro1, 1x4x4x16 of @wo2.
    %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %14 = muli %13, %c4 : index
    %15 = muli %11, %c4 : index
    // Per-invocation 1x4x1x4 view of the workgroup's @wo2 view, offset by the thread ids.
    %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // Zero-fill: one write of the all-zero %cst vector per index 0..3 of the second dimension.
    vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // The thread ids are queried a second time here (%17-%19 repeat %11-%13).
    %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %18 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %20 = muli %19, %c8 : index
    %21 = muli %18, %c2 : index
    // Per-invocation views: 1x9x3x3 of the @ro0 view and 3x3x3x4 of the @ro1 view.
    %22 = memref.subview %8[0, %20, %21, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %23 = muli %17, %c4 : index
    %24 = memref.subview %9[0, 0, 0, %23] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %25 = muli %19, %c4 : index
    %26 = muli %17, %c4 : index
    // %27 has the same source, offsets (z id * 4, y id, x id * 4) and sizes as %16 above.
    %27 = memref.subview %10[0, %25, %18, %26] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // Read the four rows back as 1x4 vectors; they seed the loop-carried accumulators below.
    %28 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %29 = vector.transfer_read %27[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %30 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %31 = vector.transfer_read %27[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    // Two nested loops, each running 0..2 with step 1, carry the four accumulators as iter_args.
    %32:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %28, %arg2 = %29, %arg3 = %30, %arg4 = %31) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        // Views for this (%arg0, %arg5) step: a 1x7x1x3 slice of %22 and a 1x1x3x4 slice of %24.
        %34 = memref.subview %22[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %35 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        // Three 1x4 rows of the %35 slice, at index 0, 1 and 2 of its third dimension.
        %36 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %37 = vector.transfer_read %35[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %38 = vector.transfer_read %35[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        // Accumulator 0 (%arg6): read row 0 of %34, then for each of its 3 elements
        // broadcast the scalar, multiply by the matching %36/%37/%38 row and add to the running sum.
        %39 = vector.transfer_read %34[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %40 = vector.extract %39[0, 0] : vector<1x3xf32>
        %41 = vector.broadcast %40 : f32 to vector<4xf32>
        %42 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %43 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
        %44 = mulf %41, %42 : vector<4xf32>
        %45 = addf %44, %43 : vector<4xf32>
        %46 = vector.extract %39[0, 1] : vector<1x3xf32>
        %47 = vector.broadcast %46 : f32 to vector<4xf32>
        %48 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
        %49 = mulf %47, %48 : vector<4xf32>
        %50 = addf %49, %45 : vector<4xf32>
        %51 = vector.extract %39[0, 2] : vector<1x3xf32>
        %52 = vector.broadcast %51 : f32 to vector<4xf32>
        %53 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
        %54 = mulf %52, %53 : vector<4xf32>
        %55 = addf %54, %50 : vector<4xf32>
        %56 = vector.shape_cast %55 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 1 (%arg7): same pattern using row 2 of %34.
        %57 = vector.transfer_read %34[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %58 = vector.extract %57[0, 0] : vector<1x3xf32>
        %59 = vector.broadcast %58 : f32 to vector<4xf32>
        %60 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %61 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %62 = mulf %59, %60 : vector<4xf32>
        %63 = addf %62, %61 : vector<4xf32>
        %64 = vector.extract %57[0, 1] : vector<1x3xf32>
        %65 = vector.broadcast %64 : f32 to vector<4xf32>
        %66 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
        %67 = mulf %65, %66 : vector<4xf32>
        %68 = addf %67, %63 : vector<4xf32>
        %69 = vector.extract %57[0, 2] : vector<1x3xf32>
        %70 = vector.broadcast %69 : f32 to vector<4xf32>
        %71 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
        %72 = mulf %70, %71 : vector<4xf32>
        %73 = addf %72, %68 : vector<4xf32>
        %74 = vector.shape_cast %73 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 2 (%arg8): same pattern using row 4 of %34.
        %75 = vector.transfer_read %34[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %76 = vector.extract %75[0, 0] : vector<1x3xf32>
        %77 = vector.broadcast %76 : f32 to vector<4xf32>
        %78 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %79 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %80 = mulf %77, %78 : vector<4xf32>
        %81 = addf %80, %79 : vector<4xf32>
        %82 = vector.extract %75[0, 1] : vector<1x3xf32>
        %83 = vector.broadcast %82 : f32 to vector<4xf32>
        %84 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
        %85 = mulf %83, %84 : vector<4xf32>
        %86 = addf %85, %81 : vector<4xf32>
        %87 = vector.extract %75[0, 2] : vector<1x3xf32>
        %88 = vector.broadcast %87 : f32 to vector<4xf32>
        %89 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
        %90 = mulf %88, %89 : vector<4xf32>
        %91 = addf %90, %86 : vector<4xf32>
        %92 = vector.shape_cast %91 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 3 (%arg9): same pattern using row 6 of %34.
        %93 = vector.transfer_read %34[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %94 = vector.extract %93[0, 0] : vector<1x3xf32>
        %95 = vector.broadcast %94 : f32 to vector<4xf32>
        %96 = vector.shape_cast %36 : vector<1x4xf32> to vector<4xf32>
        %97 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %98 = mulf %95, %96 : vector<4xf32>
        %99 = addf %98, %97 : vector<4xf32>
        %100 = vector.extract %93[0, 1] : vector<1x3xf32>
        %101 = vector.broadcast %100 : f32 to vector<4xf32>
        %102 = vector.shape_cast %37 : vector<1x4xf32> to vector<4xf32>
        %103 = mulf %101, %102 : vector<4xf32>
        %104 = addf %103, %99 : vector<4xf32>
        %105 = vector.extract %93[0, 2] : vector<1x3xf32>
        %106 = vector.broadcast %105 : f32 to vector<4xf32>
        %107 = vector.shape_cast %38 : vector<1x4xf32> to vector<4xf32>
        %108 = mulf %106, %107 : vector<4xf32>
        %109 = addf %108, %104 : vector<4xf32>
        %110 = vector.shape_cast %109 : vector<4xf32> to vector<1x4xf32>
        scf.yield %56, %74, %92, %110 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %33#0, %33#1, %33#2, %33#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    // Store the four accumulated rows back to %27; the writes are emitted in the order 3, 2, 1, 0.
    vector.transfer_write %32#3, %27[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#2, %27[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#1, %27[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %32#0, %27[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    return
  }
  // Private interface with three StorageBuffer bindings in set 0: two with Read access, one with Write|Discard.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Snapshot of the dispatch module as printed after the CSE pass.
// In this snapshot the thread ids are queried once, a single 1x4x1x4 output view (%16) serves
// the zero-fill, the read-back and the final stores, and each filter-row shape_cast appears once.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Entry point; the ABI attribute sets local_size to [4, 4, 1].
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    %c16 = constant 16 : index
    %c8 = constant 8 : index
    // Buffer bindings: @ro0 (1x225x225x3) and @ro1 (3x3x3x32) are read, @wo2 (1x112x112x32) is written.
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Per-workgroup offsets derived from the workgroup ids (times 4, 4, 16, 8, 8).
    %3 = muli %workgroup_id_z, %c4 : index
    %4 = muli %workgroup_id_y, %c4 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %7 = muli %workgroup_id_y, %c8 : index
    // Per-workgroup views: 1x9x9x3 of @ro0, 3x3x3x16 of @ro1, 1x4x4x16 of @wo2.
    %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %14 = muli %13, %c4 : index
    %15 = muli %11, %c4 : index
    // Per-invocation 1x4x1x4 view of the workgroup's @wo2 view, offset by the thread ids.
    %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    // Zero-fill: one write of the all-zero %cst vector per index 0..3 of the second dimension.
    vector.transfer_write %cst, %16[%c0, %c0, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c1, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c2, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %cst, %16[%c0, %c3, %c0, %c0] {masked = [false, false, false, false]} : vector<1x1x1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    %17 = muli %13, %c8 : index
    %18 = muli %12, %c2 : index
    // Per-invocation views: 1x9x3x3 of the @ro0 view and 3x3x3x4 of the @ro1 view.
    %19 = memref.subview %8[0, %17, %18, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
    %20 = memref.subview %9[0, 0, 0, %15] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
    // Read the four rows of %16 back as 1x4 vectors; they seed the loop-carried accumulators below.
    %21 = vector.transfer_read %16[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %22 = vector.transfer_read %16[%c0, %c1, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %23 = vector.transfer_read %16[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    %24 = vector.transfer_read %16[%c0, %c3, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>, vector<1x4xf32>
    // Two nested loops, each running 0..2 with step 1, carry the four accumulators as iter_args.
    %25:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %21, %arg2 = %22, %arg3 = %23, %arg4 = %24) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %26:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        // Views for this (%arg0, %arg5) step: a 1x7x1x3 slice of %19 and a 1x1x3x4 slice of %20.
        %27 = memref.subview %19[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
        %28 = memref.subview %20[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
        // Three 1x4 rows of the %28 slice, at index 0, 1 and 2 of its third dimension.
        %29 = vector.transfer_read %28[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %30 = vector.transfer_read %28[%c0, %c0, %c1, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        %31 = vector.transfer_read %28[%c0, %c0, %c2, %c0], %cst_0 {masked = [false, false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<1x4xf32>
        // Accumulator 0 (%arg6): read row 0 of %27, then for each of its 3 elements broadcast the
        // scalar, multiply by the matching filter row and add to the running sum.
        // The flattened filter rows %35, %41 and %46 defined here are reused by the other three chains.
        %32 = vector.transfer_read %27[%c0, %c0, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %33 = vector.extract %32[0, 0] : vector<1x3xf32>
        %34 = vector.broadcast %33 : f32 to vector<4xf32>
        %35 = vector.shape_cast %29 : vector<1x4xf32> to vector<4xf32>
        %36 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
        %37 = mulf %34, %35 : vector<4xf32>
        %38 = addf %37, %36 : vector<4xf32>
        %39 = vector.extract %32[0, 1] : vector<1x3xf32>
        %40 = vector.broadcast %39 : f32 to vector<4xf32>
        %41 = vector.shape_cast %30 : vector<1x4xf32> to vector<4xf32>
        %42 = mulf %40, %41 : vector<4xf32>
        %43 = addf %42, %38 : vector<4xf32>
        %44 = vector.extract %32[0, 2] : vector<1x3xf32>
        %45 = vector.broadcast %44 : f32 to vector<4xf32>
        %46 = vector.shape_cast %31 : vector<1x4xf32> to vector<4xf32>
        %47 = mulf %45, %46 : vector<4xf32>
        %48 = addf %47, %43 : vector<4xf32>
        %49 = vector.shape_cast %48 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 1 (%arg7): same pattern using row 2 of %27.
        %50 = vector.transfer_read %27[%c0, %c2, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %51 = vector.extract %50[0, 0] : vector<1x3xf32>
        %52 = vector.broadcast %51 : f32 to vector<4xf32>
        %53 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %54 = mulf %52, %35 : vector<4xf32>
        %55 = addf %54, %53 : vector<4xf32>
        %56 = vector.extract %50[0, 1] : vector<1x3xf32>
        %57 = vector.broadcast %56 : f32 to vector<4xf32>
        %58 = mulf %57, %41 : vector<4xf32>
        %59 = addf %58, %55 : vector<4xf32>
        %60 = vector.extract %50[0, 2] : vector<1x3xf32>
        %61 = vector.broadcast %60 : f32 to vector<4xf32>
        %62 = mulf %61, %46 : vector<4xf32>
        %63 = addf %62, %59 : vector<4xf32>
        %64 = vector.shape_cast %63 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 2 (%arg8): same pattern using row 4 of %27.
        %65 = vector.transfer_read %27[%c0, %c4, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %66 = vector.extract %65[0, 0] : vector<1x3xf32>
        %67 = vector.broadcast %66 : f32 to vector<4xf32>
        %68 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %69 = mulf %67, %35 : vector<4xf32>
        %70 = addf %69, %68 : vector<4xf32>
        %71 = vector.extract %65[0, 1] : vector<1x3xf32>
        %72 = vector.broadcast %71 : f32 to vector<4xf32>
        %73 = mulf %72, %41 : vector<4xf32>
        %74 = addf %73, %70 : vector<4xf32>
        %75 = vector.extract %65[0, 2] : vector<1x3xf32>
        %76 = vector.broadcast %75 : f32 to vector<4xf32>
        %77 = mulf %76, %46 : vector<4xf32>
        %78 = addf %77, %74 : vector<4xf32>
        %79 = vector.shape_cast %78 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 3 (%arg9): same pattern using row 6 of %27.
        %80 = vector.transfer_read %27[%c0, %c6, %c0, %c0], %cst_0 {masked = [false, false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<1x3xf32>
        %81 = vector.extract %80[0, 0] : vector<1x3xf32>
        %82 = vector.broadcast %81 : f32 to vector<4xf32>
        %83 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %84 = mulf %82, %35 : vector<4xf32>
        %85 = addf %84, %83 : vector<4xf32>
        %86 = vector.extract %80[0, 1] : vector<1x3xf32>
        %87 = vector.broadcast %86 : f32 to vector<4xf32>
        %88 = mulf %87, %41 : vector<4xf32>
        %89 = addf %88, %85 : vector<4xf32>
        %90 = vector.extract %80[0, 2] : vector<1x3xf32>
        %91 = vector.broadcast %90 : f32 to vector<4xf32>
        %92 = mulf %91, %46 : vector<4xf32>
        %93 = addf %92, %89 : vector<4xf32>
        %94 = vector.shape_cast %93 : vector<4xf32> to vector<1x4xf32>
        scf.yield %49, %64, %79, %94 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %26#0, %26#1, %26#2, %26#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    // Store the four accumulated rows back to %16; the writes are emitted in the order 3, 2, 1, 0.
    vector.transfer_write %25#3, %16[%c0, %c3, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %25#2, %16[%c0, %c2, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %25#1, %16[%c0, %c1, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    vector.transfer_write %25#0, %16[%c0, %c0, %c0, %c0] {masked = [false, false]} : vector<1x4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
    return
  }
  // Private interface with three StorageBuffer bindings in set 0: two with Read access, one with Write|Discard.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::VectorTransferOptimizationPass ***
// Function-level snapshot printed after VectorTransferOptimizationPass.
// This dump contains no transfer_write of %cst and no transfer_read from the output view %16:
// the loop accumulators are seeded from %cst through shape_casts, and %16 is only written after the loops.
// The remaining transfer reads/writes use 1-D vectors (vector<4xf32> / vector<3xf32>).
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  %cst_0 = constant 0.000000e+00 : f32
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c0 = constant 0 : index
  %c16 = constant 16 : index
  %c8 = constant 8 : index
  // Buffer bindings: @ro0 (1x225x225x3) and @ro1 (3x3x3x32) are read, @wo2 (1x112x112x32) is written.
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Per-workgroup offsets derived from the workgroup ids (times 4, 4, 16, 8, 8).
  %3 = muli %workgroup_id_z, %c4 : index
  %4 = muli %workgroup_id_y, %c4 : index
  %5 = muli %workgroup_id_x, %c16 : index
  %6 = muli %workgroup_id_z, %c8 : index
  %7 = muli %workgroup_id_y, %c8 : index
  // Per-workgroup views: 1x9x9x3 of @ro0, 3x3x3x16 of @ro1, 1x4x4x16 of @wo2.
  %8 = memref.subview %0[0, %6, %7, 0] [1, 9, 9, 3] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %9 = memref.subview %1[0, 0, 0, %5] [3, 3, 3, 16] [1, 1, 1, 1] : memref<3x3x3x32xf32> to memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  %10 = memref.subview %2[0, %3, %4, %5] [1, 4, 4, 16] [1, 1, 1, 1] : memref<1x112x112x32xf32> to memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %13 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %14 = muli %13, %c4 : index
  %15 = muli %11, %c4 : index
  // Per-invocation 1x4x1x4 view of the workgroup's @wo2 view, offset by the thread ids.
  %16 = memref.subview %10[0, %14, %12, %15] [1, 4, 1, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>> to memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  // Four identical casts of the zero constant from 1x1x1x4 down to a flat 4-element vector.
  %17 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<4xf32>
  %18 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<4xf32>
  %19 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<4xf32>
  %20 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<4xf32>
  %21 = muli %13, %c8 : index
  %22 = muli %12, %c2 : index
  // Per-invocation views: 1x9x3x3 of the @ro0 view and 3x3x3x4 of the @ro1 view.
  %23 = memref.subview %8[0, %21, %22, 0] [1, 9, 3, 3] [1, 1, 1, 1] : memref<1x9x9x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
  %24 = memref.subview %9[0, 0, 0, %15] [3, 3, 3, 4] [1, 1, 1, 1] : memref<3x3x3x16xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
  // Cast the flat zero vectors up to 1x4; these are the initial values of the four loop accumulators.
  %25 = vector.shape_cast %17 : vector<4xf32> to vector<1x4xf32>
  %26 = vector.shape_cast %18 : vector<4xf32> to vector<1x4xf32>
  %27 = vector.shape_cast %19 : vector<4xf32> to vector<1x4xf32>
  %28 = vector.shape_cast %20 : vector<4xf32> to vector<1x4xf32>
  // Two nested loops, each running 0..2 with step 1, carry the four accumulators as iter_args.
  %29:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %25, %arg2 = %26, %arg3 = %27, %arg4 = %28) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
    %34:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      // Views for this (%arg0, %arg5) step: a 1x7x1x3 slice of %23 and a 1x1x3x4 slice of %24.
      %35 = memref.subview %23[0, %arg0, %arg5, 0] [1, 7, 1, 3] [1, 1, 1, 1] : memref<1x9x3x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>> to memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
      %36 = memref.subview %24[%arg0, %arg5, 0, 0] [1, 1, 3, 4] [1, 1, 1, 1] : memref<3x3x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>> to memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>
      // Three 4-element rows of the %36 slice, at index 0, 1 and 2 of its third dimension.
      %37 = vector.transfer_read %36[%c0, %c0, %c0, %c0], %cst_0 {masked = [false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
      %38 = vector.transfer_read %36[%c0, %c0, %c1, %c0], %cst_0 {masked = [false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
      %39 = vector.transfer_read %36[%c0, %c0, %c2, %c0], %cst_0 {masked = [false]} : memref<1x1x3x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 288 + s0 + d1 * 96 + d2 * 32 + d3)>>, vector<4xf32>
      // Accumulator 0 (%arg6): read row 0 of %35 as 3 elements, then for each element broadcast
      // the scalar, multiply by the matching %37/%38/%39 row and add to the running sum.
      %40 = vector.transfer_read %35[%c0, %c0, %c0, %c0], %cst_0 {masked = [false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
      %41 = vector.extract %40[0] : vector<3xf32>
      %42 = vector.broadcast %41 : f32 to vector<4xf32>
      %43 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
      %44 = mulf %42, %37 : vector<4xf32>
      %45 = addf %44, %43 : vector<4xf32>
      %46 = vector.extract %40[1] : vector<3xf32>
      %47 = vector.broadcast %46 : f32 to vector<4xf32>
      %48 = mulf %47, %38 : vector<4xf32>
      %49 = addf %48, %45 : vector<4xf32>
      %50 = vector.extract %40[2] : vector<3xf32>
      %51 = vector.broadcast %50 : f32 to vector<4xf32>
      %52 = mulf %51, %39 : vector<4xf32>
      %53 = addf %52, %49 : vector<4xf32>
      %54 = vector.shape_cast %53 : vector<4xf32> to vector<1x4xf32>
      // Accumulator 1 (%arg7): same pattern using row 2 of %35.
      %55 = vector.transfer_read %35[%c0, %c2, %c0, %c0], %cst_0 {masked = [false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
      %56 = vector.extract %55[0] : vector<3xf32>
      %57 = vector.broadcast %56 : f32 to vector<4xf32>
      %58 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
      %59 = mulf %57, %37 : vector<4xf32>
      %60 = addf %59, %58 : vector<4xf32>
      %61 = vector.extract %55[1] : vector<3xf32>
      %62 = vector.broadcast %61 : f32 to vector<4xf32>
      %63 = mulf %62, %38 : vector<4xf32>
      %64 = addf %63, %60 : vector<4xf32>
      %65 = vector.extract %55[2] : vector<3xf32>
      %66 = vector.broadcast %65 : f32 to vector<4xf32>
      %67 = mulf %66, %39 : vector<4xf32>
      %68 = addf %67, %64 : vector<4xf32>
      %69 = vector.shape_cast %68 : vector<4xf32> to vector<1x4xf32>
      // Accumulator 2 (%arg8): same pattern using row 4 of %35.
      %70 = vector.transfer_read %35[%c0, %c4, %c0, %c0], %cst_0 {masked = [false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
      %71 = vector.extract %70[0] : vector<3xf32>
      %72 = vector.broadcast %71 : f32 to vector<4xf32>
      %73 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
      %74 = mulf %72, %37 : vector<4xf32>
      %75 = addf %74, %73 : vector<4xf32>
      %76 = vector.extract %70[1] : vector<3xf32>
      %77 = vector.broadcast %76 : f32 to vector<4xf32>
      %78 = mulf %77, %38 : vector<4xf32>
      %79 = addf %78, %75 : vector<4xf32>
      %80 = vector.extract %70[2] : vector<3xf32>
      %81 = vector.broadcast %80 : f32 to vector<4xf32>
      %82 = mulf %81, %39 : vector<4xf32>
      %83 = addf %82, %79 : vector<4xf32>
      %84 = vector.shape_cast %83 : vector<4xf32> to vector<1x4xf32>
      // Accumulator 3 (%arg9): same pattern using row 6 of %35.
      %85 = vector.transfer_read %35[%c0, %c6, %c0, %c0], %cst_0 {masked = [false]} : memref<1x7x1x3xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>, vector<3xf32>
      %86 = vector.extract %85[0] : vector<3xf32>
      %87 = vector.broadcast %86 : f32 to vector<4xf32>
      %88 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
      %89 = mulf %87, %37 : vector<4xf32>
      %90 = addf %89, %88 : vector<4xf32>
      %91 = vector.extract %85[1] : vector<3xf32>
      %92 = vector.broadcast %91 : f32 to vector<4xf32>
      %93 = mulf %92, %38 : vector<4xf32>
      %94 = addf %93, %90 : vector<4xf32>
      %95 = vector.extract %85[2] : vector<3xf32>
      %96 = vector.broadcast %95 : f32 to vector<4xf32>
      %97 = mulf %96, %39 : vector<4xf32>
      %98 = addf %97, %94 : vector<4xf32>
      %99 = vector.shape_cast %98 : vector<4xf32> to vector<1x4xf32>
      scf.yield %54, %69, %84, %99 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    scf.yield %34#0, %34#1, %34#2, %34#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
  }
  // Flatten each accumulated 1x4 result to 4 elements and store it to %16; writes are emitted in the order 3, 2, 1, 0.
  %30 = vector.shape_cast %29#3 : vector<1x4xf32> to vector<4xf32>
  vector.transfer_write %30, %16[%c0, %c3, %c0, %c0] {masked = [false]} : vector<4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  %31 = vector.shape_cast %29#2 : vector<1x4xf32> to vector<4xf32>
  vector.transfer_write %31, %16[%c0, %c2, %c0, %c0] {masked = [false]} : vector<4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  %32 = vector.shape_cast %29#1 : vector<1x4xf32> to vector<4xf32>
  vector.transfer_write %32, %16[%c0, %c1, %c0, %c0] {masked = [false]} : vector<4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  %33 = vector.shape_cast %29#0 : vector<1x4xf32> to vector<4xf32>
  vector.transfer_write %33, %16[%c0, %c0, %c0, %c0] {masked = [false]} : vector<4xf32>, memref<1x4x1x4xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 401408 + s0 + d1 * 3584 + d2 * 32 + d3)>>
  return
}

// *** IR Dump After LegalizeStandardForSPIRV ***
// Module for the conv dispatch: one entry function (@conv_dispatch_0) and the
// HAL interface (@legacy_io) declaring the three storage-buffer bindings it
// uses. The spv.target_env attribute records the SPIR-V target: version 1.3,
// the listed capabilities/extensions, ARM:IntegratedGPU, and resource limits.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Entry point with local workgroup size 4x4x1 (spv.entry_point_abi).
  // Each invocation carries four vector<1x4xf32> accumulators through a 3x3
  // scf.for nest and finally stores them to four consecutive dim-1 positions
  // of the @wo2 buffer.
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    // Zero vector used (after shape_cast) as the initial accumulator value.
    %cst = constant dense<0.000000e+00> : vector<1x1x1x4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    // Scalar zero passed as the padding operand of every vector.transfer_read.
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    %c16 = constant 16 : index
    %c8 = constant 8 : index
    // Buffer bindings. The memref shapes match %arg0, %arg1 and the result of
    // the original func @conv (1x225x225x3, 3x3x3x32, 1x112x112x32).
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Per-workgroup base offsets.
    //   %3, %4, %5 (z*4, y*4, x*16) index dims 1, 2, 3 of the output %2;
    //   %5 also indexes dim 3 of %1.
    //   %6, %7 (z*8, y*8) index dims 1, 2 of the input %0.
    %3 = muli %workgroup_id_z, %c4 : index
    %4 = muli %workgroup_id_y, %c4 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %7 = muli %workgroup_id_y, %c8 : index
    %8 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "y"} : () -> index
    // NOTE(review): local_size z is 1, so %10 is presumably always 0 — confirm.
    %10 = "gpu.thread_id"() {dimension = "z"} : () -> index
    // Per-invocation offsets added to the workgroup bases above:
    //   %11 (tid_z*4) -> output dim 1, %12 (tid_x*4) -> dim 3 of %1 and %2,
    //   %13 (tid_z*8) -> input dim 1,  %14 (tid_y*2) -> input dim 2.
    %11 = muli %10, %c4 : index
    %12 = muli %8, %c4 : index
    %13 = muli %10, %c8 : index
    %14 = muli %9, %c2 : index
    // Four identical casts of the zero constant, one per accumulator.
    %15 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<1x4xf32>
    %16 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<1x4xf32>
    %17 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<1x4xf32>
    %18 = vector.shape_cast %cst : vector<1x1x1x4xf32> to vector<1x4xf32>
    // Outer loop: %arg0 runs 0..2 and indexes dim 0 of %1; it is also added to
    // the dim-1 index of %0.
    %19:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %15, %arg2 = %16, %arg3 = %17, %arg4 = %18) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      // Inner loop: %arg5 runs 0..2 and indexes dim 1 of %1; it is also added
      // to the dim-2 index of %0.
      %39:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        // Three 4-wide reads from %1 at dim-2 index 0, 1, 2. The dim-3 index
        // (%5 + %12) is recomputed before each read.
        %40 = addi %5, %12 : index
        %41 = vector.transfer_read %1[%arg0, %arg5, %c0, %40], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        %42 = addi %5, %12 : index
        %43 = vector.transfer_read %1[%arg0, %arg5, %c1, %42], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        %44 = addi %5, %12 : index
        %45 = vector.transfer_read %1[%arg0, %arg5, %c2, %44], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        // Accumulator 0 (%arg6): read 3 elements of %0 at dim-1 offset
        // %arg0 + 0, broadcast each element to 4 lanes, multiply by the
        // matching %1 vector, and add into the running sum.
        %46 = addi %13, %arg0 : index
        %47 = addi %14, %arg5 : index
        %48 = addi %6, %46 : index
        %49 = addi %7, %47 : index
        %50 = vector.transfer_read %0[%c0, %48, %49, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %51 = vector.extract %50[0] : vector<3xf32>
        %52 = vector.broadcast %51 : f32 to vector<4xf32>
        %53 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
        %54 = mulf %52, %41 : vector<4xf32>
        %55 = addf %54, %53 : vector<4xf32>
        %56 = vector.extract %50[1] : vector<3xf32>
        %57 = vector.broadcast %56 : f32 to vector<4xf32>
        %58 = mulf %57, %43 : vector<4xf32>
        %59 = addf %58, %55 : vector<4xf32>
        %60 = vector.extract %50[2] : vector<3xf32>
        %61 = vector.broadcast %60 : f32 to vector<4xf32>
        %62 = mulf %61, %45 : vector<4xf32>
        %63 = addf %62, %59 : vector<4xf32>
        %64 = vector.shape_cast %63 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 1 (%arg7): same computation at dim-1 offset %arg0 + 2.
        %65 = addi %arg0, %c2 : index
        %66 = addi %13, %65 : index
        %67 = addi %14, %arg5 : index
        %68 = addi %6, %66 : index
        %69 = addi %7, %67 : index
        %70 = vector.transfer_read %0[%c0, %68, %69, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %71 = vector.extract %70[0] : vector<3xf32>
        %72 = vector.broadcast %71 : f32 to vector<4xf32>
        %73 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %74 = mulf %72, %41 : vector<4xf32>
        %75 = addf %74, %73 : vector<4xf32>
        %76 = vector.extract %70[1] : vector<3xf32>
        %77 = vector.broadcast %76 : f32 to vector<4xf32>
        %78 = mulf %77, %43 : vector<4xf32>
        %79 = addf %78, %75 : vector<4xf32>
        %80 = vector.extract %70[2] : vector<3xf32>
        %81 = vector.broadcast %80 : f32 to vector<4xf32>
        %82 = mulf %81, %45 : vector<4xf32>
        %83 = addf %82, %79 : vector<4xf32>
        %84 = vector.shape_cast %83 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 2 (%arg8): same computation at dim-1 offset %arg0 + 4.
        %85 = addi %arg0, %c4 : index
        %86 = addi %13, %85 : index
        %87 = addi %14, %arg5 : index
        %88 = addi %6, %86 : index
        %89 = addi %7, %87 : index
        %90 = vector.transfer_read %0[%c0, %88, %89, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %91 = vector.extract %90[0] : vector<3xf32>
        %92 = vector.broadcast %91 : f32 to vector<4xf32>
        %93 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %94 = mulf %92, %41 : vector<4xf32>
        %95 = addf %94, %93 : vector<4xf32>
        %96 = vector.extract %90[1] : vector<3xf32>
        %97 = vector.broadcast %96 : f32 to vector<4xf32>
        %98 = mulf %97, %43 : vector<4xf32>
        %99 = addf %98, %95 : vector<4xf32>
        %100 = vector.extract %90[2] : vector<3xf32>
        %101 = vector.broadcast %100 : f32 to vector<4xf32>
        %102 = mulf %101, %45 : vector<4xf32>
        %103 = addf %102, %99 : vector<4xf32>
        %104 = vector.shape_cast %103 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 3 (%arg9): same computation at dim-1 offset %arg0 + 6.
        %105 = addi %arg0, %c6 : index
        %106 = addi %13, %105 : index
        %107 = addi %14, %arg5 : index
        %108 = addi %6, %106 : index
        %109 = addi %7, %107 : index
        %110 = vector.transfer_read %0[%c0, %108, %109, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %111 = vector.extract %110[0] : vector<3xf32>
        %112 = vector.broadcast %111 : f32 to vector<4xf32>
        %113 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %114 = mulf %112, %41 : vector<4xf32>
        %115 = addf %114, %113 : vector<4xf32>
        %116 = vector.extract %110[1] : vector<3xf32>
        %117 = vector.broadcast %116 : f32 to vector<4xf32>
        %118 = mulf %117, %43 : vector<4xf32>
        %119 = addf %118, %115 : vector<4xf32>
        %120 = vector.extract %110[2] : vector<3xf32>
        %121 = vector.broadcast %120 : f32 to vector<4xf32>
        %122 = mulf %121, %45 : vector<4xf32>
        %123 = addf %122, %119 : vector<4xf32>
        %124 = vector.shape_cast %123 : vector<4xf32> to vector<1x4xf32>
        // Hand the four updated accumulators to the next inner iteration.
        scf.yield %64, %84, %104, %124 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      // Forward the inner-loop results unchanged to the next outer iteration.
      scf.yield %39#0, %39#1, %39#2, %39#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    // Store the accumulators to %2. Result #3 goes to dim-1 offset +3, #2 to
    // +2, #1 to +1 and #0 to +0 (relative to %3 + %11); dims 2 and 3 use
    // %4 + %9 and %5 + %12, recomputed before each write.
    %20 = vector.shape_cast %19#3 : vector<1x4xf32> to vector<4xf32>
    %21 = addi %11, %c3 : index
    %22 = addi %3, %21 : index
    %23 = addi %4, %9 : index
    %24 = addi %5, %12 : index
    vector.transfer_write %20, %2[%c0, %22, %23, %24] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %25 = vector.shape_cast %19#2 : vector<1x4xf32> to vector<4xf32>
    %26 = addi %11, %c2 : index
    %27 = addi %3, %26 : index
    %28 = addi %4, %9 : index
    %29 = addi %5, %12 : index
    vector.transfer_write %25, %2[%c0, %27, %28, %29] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %30 = vector.shape_cast %19#1 : vector<1x4xf32> to vector<4xf32>
    %31 = addi %11, %c1 : index
    %32 = addi %3, %31 : index
    %33 = addi %4, %9 : index
    %34 = addi %5, %12 : index
    vector.transfer_write %30, %2[%c0, %32, %33, %34] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %35 = vector.shape_cast %19#0 : vector<1x4xf32> to vector<4xf32>
    %36 = addi %3, %11 : index
    %37 = addi %4, %9 : index
    %38 = addi %5, %12 : index
    vector.transfer_write %35, %2[%c0, %36, %37, %38] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    return
  }
  // Binding table referenced by the hal.interface.binding.subspan ops above:
  // two read-only storage buffers and one write/discard storage buffer, all
  // in descriptor set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After Canonicalizer ***
// Module for the conv dispatch as printed after the Canonicalizer pass: one
// entry function (@conv_dispatch_0) plus the HAL interface (@legacy_io) that
// declares its three storage-buffer bindings. The spv.target_env attribute
// records the SPIR-V target (v1.3, ARM:IntegratedGPU) and its limits.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Entry point with local workgroup size 4x4x1. Each invocation accumulates
  // four vector<1x4xf32> values over a 3x3 scf.for nest and writes them to
  // four consecutive dim-1 positions of the @wo2 buffer.
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    // Zero constant already in the accumulator type; it seeds all four
    // iter_args of the outer loop directly.
    %cst = constant dense<0.000000e+00> : vector<1x4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    // Scalar zero passed as the padding operand of every vector.transfer_read.
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    %c16 = constant 16 : index
    %c8 = constant 8 : index
    // Buffer bindings; shapes match %arg0, %arg1 and the result of the
    // original func @conv.
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Workgroup base offsets: %3/%4/%5 (z*4, y*4, x*16) feed the output
    // indices (and %5 the dim-3 index of %1); %6/%7 (z*8, y*8) feed the
    // dim-1/dim-2 indices of the input %0.
    %3 = muli %workgroup_id_z, %c4 : index
    %4 = muli %workgroup_id_y, %c4 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %7 = muli %workgroup_id_y, %c8 : index
    %8 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "y"} : () -> index
    // NOTE(review): local_size z is 1, so %10 is presumably always 0 — confirm.
    %10 = "gpu.thread_id"() {dimension = "z"} : () -> index
    // Invocation offsets: %11 (tid_z*4) -> output dim 1, %12 (tid_x*4) ->
    // dim 3 of %1 and %2, %13 (tid_z*8) -> input dim 1, %14 (tid_y*2) ->
    // input dim 2.
    %11 = muli %10, %c4 : index
    %12 = muli %8, %c4 : index
    %13 = muli %10, %c8 : index
    %14 = muli %9, %c2 : index
    // Outer loop over %arg0 in 0..2 (dim 0 of %1, also added to the input
    // dim-1 index).
    %15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      // Inner loop over %arg5 in 0..2 (dim 1 of %1, also added to the input
      // dim-2 index).
      %35:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        // Three 4-wide reads from %1 at dim-2 index 0, 1, 2; the shared
        // dim-3 index %5 + %12 is still computed three times here.
        %36 = addi %5, %12 : index
        %37 = vector.transfer_read %1[%arg0, %arg5, %c0, %36], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        %38 = addi %5, %12 : index
        %39 = vector.transfer_read %1[%arg0, %arg5, %c1, %38], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        %40 = addi %5, %12 : index
        %41 = vector.transfer_read %1[%arg0, %arg5, %c2, %40], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        // Accumulator 0 (%arg6): read 3 elements of %0 at dim-1 offset
        // %arg0 + 0; each element is broadcast to 4 lanes, multiplied by the
        // matching %1 vector and added to the running sum.
        %42 = addi %13, %arg0 : index
        %43 = addi %14, %arg5 : index
        %44 = addi %6, %42 : index
        %45 = addi %7, %43 : index
        %46 = vector.transfer_read %0[%c0, %44, %45, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %47 = vector.extract %46[0] : vector<3xf32>
        %48 = vector.broadcast %47 : f32 to vector<4xf32>
        %49 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
        %50 = mulf %48, %37 : vector<4xf32>
        %51 = addf %50, %49 : vector<4xf32>
        %52 = vector.extract %46[1] : vector<3xf32>
        %53 = vector.broadcast %52 : f32 to vector<4xf32>
        %54 = mulf %53, %39 : vector<4xf32>
        %55 = addf %54, %51 : vector<4xf32>
        %56 = vector.extract %46[2] : vector<3xf32>
        %57 = vector.broadcast %56 : f32 to vector<4xf32>
        %58 = mulf %57, %41 : vector<4xf32>
        %59 = addf %58, %55 : vector<4xf32>
        %60 = vector.shape_cast %59 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 1 (%arg7): same computation at dim-1 offset %arg0 + 2.
        %61 = addi %arg0, %c2 : index
        %62 = addi %13, %61 : index
        %63 = addi %14, %arg5 : index
        %64 = addi %6, %62 : index
        %65 = addi %7, %63 : index
        %66 = vector.transfer_read %0[%c0, %64, %65, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %67 = vector.extract %66[0] : vector<3xf32>
        %68 = vector.broadcast %67 : f32 to vector<4xf32>
        %69 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %70 = mulf %68, %37 : vector<4xf32>
        %71 = addf %70, %69 : vector<4xf32>
        %72 = vector.extract %66[1] : vector<3xf32>
        %73 = vector.broadcast %72 : f32 to vector<4xf32>
        %74 = mulf %73, %39 : vector<4xf32>
        %75 = addf %74, %71 : vector<4xf32>
        %76 = vector.extract %66[2] : vector<3xf32>
        %77 = vector.broadcast %76 : f32 to vector<4xf32>
        %78 = mulf %77, %41 : vector<4xf32>
        %79 = addf %78, %75 : vector<4xf32>
        %80 = vector.shape_cast %79 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 2 (%arg8): same computation at dim-1 offset %arg0 + 4.
        %81 = addi %arg0, %c4 : index
        %82 = addi %13, %81 : index
        %83 = addi %14, %arg5 : index
        %84 = addi %6, %82 : index
        %85 = addi %7, %83 : index
        %86 = vector.transfer_read %0[%c0, %84, %85, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %87 = vector.extract %86[0] : vector<3xf32>
        %88 = vector.broadcast %87 : f32 to vector<4xf32>
        %89 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %90 = mulf %88, %37 : vector<4xf32>
        %91 = addf %90, %89 : vector<4xf32>
        %92 = vector.extract %86[1] : vector<3xf32>
        %93 = vector.broadcast %92 : f32 to vector<4xf32>
        %94 = mulf %93, %39 : vector<4xf32>
        %95 = addf %94, %91 : vector<4xf32>
        %96 = vector.extract %86[2] : vector<3xf32>
        %97 = vector.broadcast %96 : f32 to vector<4xf32>
        %98 = mulf %97, %41 : vector<4xf32>
        %99 = addf %98, %95 : vector<4xf32>
        %100 = vector.shape_cast %99 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 3 (%arg9): same computation at dim-1 offset %arg0 + 6.
        %101 = addi %arg0, %c6 : index
        %102 = addi %13, %101 : index
        %103 = addi %14, %arg5 : index
        %104 = addi %6, %102 : index
        %105 = addi %7, %103 : index
        %106 = vector.transfer_read %0[%c0, %104, %105, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %107 = vector.extract %106[0] : vector<3xf32>
        %108 = vector.broadcast %107 : f32 to vector<4xf32>
        %109 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %110 = mulf %108, %37 : vector<4xf32>
        %111 = addf %110, %109 : vector<4xf32>
        %112 = vector.extract %106[1] : vector<3xf32>
        %113 = vector.broadcast %112 : f32 to vector<4xf32>
        %114 = mulf %113, %39 : vector<4xf32>
        %115 = addf %114, %111 : vector<4xf32>
        %116 = vector.extract %106[2] : vector<3xf32>
        %117 = vector.broadcast %116 : f32 to vector<4xf32>
        %118 = mulf %117, %41 : vector<4xf32>
        %119 = addf %118, %115 : vector<4xf32>
        %120 = vector.shape_cast %119 : vector<4xf32> to vector<1x4xf32>
        // Hand the four updated accumulators to the next inner iteration.
        scf.yield %60, %80, %100, %120 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      // Forward the inner-loop results unchanged to the next outer iteration.
      scf.yield %35#0, %35#1, %35#2, %35#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    // Store the accumulators to %2: result #3 at dim-1 offset +3, #2 at +2,
    // #1 at +1 and #0 at +0 (relative to %3 + %11). The dim-2 (%4 + %9) and
    // dim-3 (%5 + %12) indices are recomputed before each write.
    %16 = vector.shape_cast %15#3 : vector<1x4xf32> to vector<4xf32>
    %17 = addi %11, %c3 : index
    %18 = addi %3, %17 : index
    %19 = addi %4, %9 : index
    %20 = addi %5, %12 : index
    vector.transfer_write %16, %2[%c0, %18, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %21 = vector.shape_cast %15#2 : vector<1x4xf32> to vector<4xf32>
    %22 = addi %11, %c2 : index
    %23 = addi %3, %22 : index
    %24 = addi %4, %9 : index
    %25 = addi %5, %12 : index
    vector.transfer_write %21, %2[%c0, %23, %24, %25] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %26 = vector.shape_cast %15#1 : vector<1x4xf32> to vector<4xf32>
    %27 = addi %11, %c1 : index
    %28 = addi %3, %27 : index
    %29 = addi %4, %9 : index
    %30 = addi %5, %12 : index
    vector.transfer_write %26, %2[%c0, %28, %29, %30] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %31 = vector.shape_cast %15#0 : vector<1x4xf32> to vector<4xf32>
    %32 = addi %3, %11 : index
    %33 = addi %4, %9 : index
    %34 = addi %5, %12 : index
    vector.transfer_write %31, %2[%c0, %32, %33, %34] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    return
  }
  // Binding table referenced by the hal.interface.binding.subspan ops above:
  // two read-only storage buffers and one write/discard storage buffer, all
  // in descriptor set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module for the conv dispatch as printed after the CSE pass: one entry
// function (@conv_dispatch_0) plus the HAL interface (@legacy_io) that
// declares its three storage-buffer bindings. The spv.target_env attribute
// records the SPIR-V target (v1.3, ARM:IntegratedGPU) and its limits.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Entry point with local workgroup size 4x4x1. Each invocation accumulates
  // four vector<1x4xf32> values over a 3x3 scf.for nest and writes them to
  // four consecutive dim-1 positions of the @wo2 buffer.
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    // Zero constant that seeds all four accumulators.
    %cst = constant dense<0.000000e+00> : vector<1x4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    // Scalar zero passed as the padding operand of every vector.transfer_read.
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    %c16 = constant 16 : index
    %c8 = constant 8 : index
    // Buffer bindings; shapes match %arg0, %arg1 and the result of the
    // original func @conv.
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x32xf32>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x32xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Workgroup base offsets: %3/%4/%5 (z*4, y*4, x*16) feed the output
    // indices (and %5 the dim-3 index of %1); %6/%7 (z*8, y*8) feed the
    // dim-1/dim-2 indices of the input %0.
    %3 = muli %workgroup_id_z, %c4 : index
    %4 = muli %workgroup_id_y, %c4 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %7 = muli %workgroup_id_y, %c8 : index
    %8 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "y"} : () -> index
    // NOTE(review): local_size z is 1, so %10 is presumably always 0 — confirm.
    %10 = "gpu.thread_id"() {dimension = "z"} : () -> index
    // Invocation offsets: %11 (tid_z*4) -> output dim 1, %12 (tid_x*4) ->
    // dim 3 of %1 and %2, %13 (tid_z*8) -> input dim 1, %14 (tid_y*2) ->
    // input dim 2.
    %11 = muli %10, %c4 : index
    %12 = muli %8, %c4 : index
    %13 = muli %10, %c8 : index
    %14 = muli %9, %c2 : index
    // Outer loop over %arg0 in 0..2 (dim 0 of %1, also added to the input
    // dim-1 index).
    %15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      // Inner loop over %arg5 in 0..2 (dim 1 of %1, also added to the input
      // dim-2 index).
      %29:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        // Single dim-3 index (%30) shared by the three 4-wide reads from %1
        // at dim-2 index 0, 1, 2.
        %30 = addi %5, %12 : index
        %31 = vector.transfer_read %1[%arg0, %arg5, %c0, %30], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        %32 = vector.transfer_read %1[%arg0, %arg5, %c1, %30], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        %33 = vector.transfer_read %1[%arg0, %arg5, %c2, %30], %cst_0 {masked = [false]} : memref<3x3x3x32xf32>, vector<4xf32>
        // Accumulator 0 (%arg6): read 3 elements of %0 at dim-1 offset
        // %arg0 + 0; each element is broadcast to 4 lanes, multiplied by the
        // matching %1 vector and added to the running sum. The dim-2 index
        // %37 computed here is reused by all four input reads below.
        %34 = addi %13, %arg0 : index
        %35 = addi %14, %arg5 : index
        %36 = addi %6, %34 : index
        %37 = addi %7, %35 : index
        %38 = vector.transfer_read %0[%c0, %36, %37, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %39 = vector.extract %38[0] : vector<3xf32>
        %40 = vector.broadcast %39 : f32 to vector<4xf32>
        %41 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
        %42 = mulf %40, %31 : vector<4xf32>
        %43 = addf %42, %41 : vector<4xf32>
        %44 = vector.extract %38[1] : vector<3xf32>
        %45 = vector.broadcast %44 : f32 to vector<4xf32>
        %46 = mulf %45, %32 : vector<4xf32>
        %47 = addf %46, %43 : vector<4xf32>
        %48 = vector.extract %38[2] : vector<3xf32>
        %49 = vector.broadcast %48 : f32 to vector<4xf32>
        %50 = mulf %49, %33 : vector<4xf32>
        %51 = addf %50, %47 : vector<4xf32>
        %52 = vector.shape_cast %51 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 1 (%arg7): same computation at dim-1 offset %arg0 + 2.
        %53 = addi %arg0, %c2 : index
        %54 = addi %13, %53 : index
        %55 = addi %6, %54 : index
        %56 = vector.transfer_read %0[%c0, %55, %37, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %57 = vector.extract %56[0] : vector<3xf32>
        %58 = vector.broadcast %57 : f32 to vector<4xf32>
        %59 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %60 = mulf %58, %31 : vector<4xf32>
        %61 = addf %60, %59 : vector<4xf32>
        %62 = vector.extract %56[1] : vector<3xf32>
        %63 = vector.broadcast %62 : f32 to vector<4xf32>
        %64 = mulf %63, %32 : vector<4xf32>
        %65 = addf %64, %61 : vector<4xf32>
        %66 = vector.extract %56[2] : vector<3xf32>
        %67 = vector.broadcast %66 : f32 to vector<4xf32>
        %68 = mulf %67, %33 : vector<4xf32>
        %69 = addf %68, %65 : vector<4xf32>
        %70 = vector.shape_cast %69 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 2 (%arg8): same computation at dim-1 offset %arg0 + 4.
        %71 = addi %arg0, %c4 : index
        %72 = addi %13, %71 : index
        %73 = addi %6, %72 : index
        %74 = vector.transfer_read %0[%c0, %73, %37, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %75 = vector.extract %74[0] : vector<3xf32>
        %76 = vector.broadcast %75 : f32 to vector<4xf32>
        %77 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %78 = mulf %76, %31 : vector<4xf32>
        %79 = addf %78, %77 : vector<4xf32>
        %80 = vector.extract %74[1] : vector<3xf32>
        %81 = vector.broadcast %80 : f32 to vector<4xf32>
        %82 = mulf %81, %32 : vector<4xf32>
        %83 = addf %82, %79 : vector<4xf32>
        %84 = vector.extract %74[2] : vector<3xf32>
        %85 = vector.broadcast %84 : f32 to vector<4xf32>
        %86 = mulf %85, %33 : vector<4xf32>
        %87 = addf %86, %83 : vector<4xf32>
        %88 = vector.shape_cast %87 : vector<4xf32> to vector<1x4xf32>
        // Accumulator 3 (%arg9): same computation at dim-1 offset %arg0 + 6.
        %89 = addi %arg0, %c6 : index
        %90 = addi %13, %89 : index
        %91 = addi %6, %90 : index
        %92 = vector.transfer_read %0[%c0, %91, %37, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %93 = vector.extract %92[0] : vector<3xf32>
        %94 = vector.broadcast %93 : f32 to vector<4xf32>
        %95 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %96 = mulf %94, %31 : vector<4xf32>
        %97 = addf %96, %95 : vector<4xf32>
        %98 = vector.extract %92[1] : vector<3xf32>
        %99 = vector.broadcast %98 : f32 to vector<4xf32>
        %100 = mulf %99, %32 : vector<4xf32>
        %101 = addf %100, %97 : vector<4xf32>
        %102 = vector.extract %92[2] : vector<3xf32>
        %103 = vector.broadcast %102 : f32 to vector<4xf32>
        %104 = mulf %103, %33 : vector<4xf32>
        %105 = addf %104, %101 : vector<4xf32>
        %106 = vector.shape_cast %105 : vector<4xf32> to vector<1x4xf32>
        // Hand the four updated accumulators to the next inner iteration.
        scf.yield %52, %70, %88, %106 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      // Forward the inner-loop results unchanged to the next outer iteration.
      scf.yield %29#0, %29#1, %29#2, %29#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    // Store the accumulators to %2: result #3 at dim-1 offset +3, #2 at +2,
    // #1 at +1 and #0 at +0 (relative to %3 + %11). The dim-2 index %19 and
    // dim-3 index %20 are computed once and shared by all four writes.
    %16 = vector.shape_cast %15#3 : vector<1x4xf32> to vector<4xf32>
    %17 = addi %11, %c3 : index
    %18 = addi %3, %17 : index
    %19 = addi %4, %9 : index
    %20 = addi %5, %12 : index
    vector.transfer_write %16, %2[%c0, %18, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %21 = vector.shape_cast %15#2 : vector<1x4xf32> to vector<4xf32>
    %22 = addi %11, %c2 : index
    %23 = addi %3, %22 : index
    vector.transfer_write %21, %2[%c0, %23, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %24 = vector.shape_cast %15#1 : vector<1x4xf32> to vector<4xf32>
    %25 = addi %11, %c1 : index
    %26 = addi %3, %25 : index
    vector.transfer_write %24, %2[%c0, %26, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    %27 = vector.shape_cast %15#0 : vector<1x4xf32> to vector<4xf32>
    %28 = addi %3, %11 : index
    vector.transfer_write %27, %2[%c0, %28, %19, %20] {masked = [false]} : vector<4xf32>, memref<1x112x112x32xf32>
    return
  }
  // Binding table referenced by the hal.interface.binding.subspan ops above:
  // two read-only storage buffers and one write/discard storage buffer, all
  // in descriptor set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::VectorizeMemRefPass ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    %cst = constant dense<0.000000e+00> : vector<1x4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    %c16 = constant 16 : index
    %c8 = constant 8 : index
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x8xvector<4xf32>>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x8xvector<4xf32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    %3 = muli %workgroup_id_z, %c4 : index
    %4 = muli %workgroup_id_y, %c4 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %7 = muli %workgroup_id_y, %c8 : index
    %8 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %10 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %11 = muli %10, %c4 : index
    %12 = muli %8, %c4 : index
    %13 = muli %10, %c8 : index
    %14 = muli %9, %c2 : index
    %15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
      %41:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>) {
        %42 = addi %5, %12 : index
        %c4_5 = constant 4 : index
        %43 = divi_signed %42, %c4_5 : index
        %44 = memref.load %1[%arg0, %arg5, %c0, %43] : memref<3x3x3x8xvector<4xf32>>
        %45 = vector.bitcast %44 : vector<4xf32> to vector<4xf32>
        %46 = vector.shape_cast %45 : vector<4xf32> to vector<4xf32>
        %c4_6 = constant 4 : index
        %47 = divi_signed %42, %c4_6 : index
        %48 = memref.load %1[%arg0, %arg5, %c1, %47] : memref<3x3x3x8xvector<4xf32>>
        %49 = vector.bitcast %48 : vector<4xf32> to vector<4xf32>
        %50 = vector.shape_cast %49 : vector<4xf32> to vector<4xf32>
        %c4_7 = constant 4 : index
        %51 = divi_signed %42, %c4_7 : index
        %52 = memref.load %1[%arg0, %arg5, %c2, %51] : memref<3x3x3x8xvector<4xf32>>
        %53 = vector.bitcast %52 : vector<4xf32> to vector<4xf32>
        %54 = vector.shape_cast %53 : vector<4xf32> to vector<4xf32>
        %55 = addi %13, %arg0 : index
        %56 = addi %14, %arg5 : index
        %57 = addi %6, %55 : index
        %58 = addi %7, %56 : index
        %59 = vector.transfer_read %0[%c0, %57, %58, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %60 = vector.extract %59[0] : vector<3xf32>
        %61 = vector.broadcast %60 : f32 to vector<4xf32>
        %62 = vector.shape_cast %arg6 : vector<1x4xf32> to vector<4xf32>
        %63 = mulf %61, %46 : vector<4xf32>
        %64 = addf %63, %62 : vector<4xf32>
        %65 = vector.extract %59[1] : vector<3xf32>
        %66 = vector.broadcast %65 : f32 to vector<4xf32>
        %67 = mulf %66, %50 : vector<4xf32>
        %68 = addf %67, %64 : vector<4xf32>
        %69 = vector.extract %59[2] : vector<3xf32>
        %70 = vector.broadcast %69 : f32 to vector<4xf32>
        %71 = mulf %70, %54 : vector<4xf32>
        %72 = addf %71, %68 : vector<4xf32>
        %73 = vector.shape_cast %72 : vector<4xf32> to vector<1x4xf32>
        %74 = addi %arg0, %c2 : index
        %75 = addi %13, %74 : index
        %76 = addi %6, %75 : index
        %77 = vector.transfer_read %0[%c0, %76, %58, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %78 = vector.extract %77[0] : vector<3xf32>
        %79 = vector.broadcast %78 : f32 to vector<4xf32>
        %80 = vector.shape_cast %arg7 : vector<1x4xf32> to vector<4xf32>
        %81 = mulf %79, %46 : vector<4xf32>
        %82 = addf %81, %80 : vector<4xf32>
        %83 = vector.extract %77[1] : vector<3xf32>
        %84 = vector.broadcast %83 : f32 to vector<4xf32>
        %85 = mulf %84, %50 : vector<4xf32>
        %86 = addf %85, %82 : vector<4xf32>
        %87 = vector.extract %77[2] : vector<3xf32>
        %88 = vector.broadcast %87 : f32 to vector<4xf32>
        %89 = mulf %88, %54 : vector<4xf32>
        %90 = addf %89, %86 : vector<4xf32>
        %91 = vector.shape_cast %90 : vector<4xf32> to vector<1x4xf32>
        %92 = addi %arg0, %c4 : index
        %93 = addi %13, %92 : index
        %94 = addi %6, %93 : index
        %95 = vector.transfer_read %0[%c0, %94, %58, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %96 = vector.extract %95[0] : vector<3xf32>
        %97 = vector.broadcast %96 : f32 to vector<4xf32>
        %98 = vector.shape_cast %arg8 : vector<1x4xf32> to vector<4xf32>
        %99 = mulf %97, %46 : vector<4xf32>
        %100 = addf %99, %98 : vector<4xf32>
        %101 = vector.extract %95[1] : vector<3xf32>
        %102 = vector.broadcast %101 : f32 to vector<4xf32>
        %103 = mulf %102, %50 : vector<4xf32>
        %104 = addf %103, %100 : vector<4xf32>
        %105 = vector.extract %95[2] : vector<3xf32>
        %106 = vector.broadcast %105 : f32 to vector<4xf32>
        %107 = mulf %106, %54 : vector<4xf32>
        %108 = addf %107, %104 : vector<4xf32>
        %109 = vector.shape_cast %108 : vector<4xf32> to vector<1x4xf32>
        %110 = addi %arg0, %c6 : index
        %111 = addi %13, %110 : index
        %112 = addi %6, %111 : index
        %113 = vector.transfer_read %0[%c0, %112, %58, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %114 = vector.extract %113[0] : vector<3xf32>
        %115 = vector.broadcast %114 : f32 to vector<4xf32>
        %116 = vector.shape_cast %arg9 : vector<1x4xf32> to vector<4xf32>
        %117 = mulf %115, %46 : vector<4xf32>
        %118 = addf %117, %116 : vector<4xf32>
        %119 = vector.extract %113[1] : vector<3xf32>
        %120 = vector.broadcast %119 : f32 to vector<4xf32>
        %121 = mulf %120, %50 : vector<4xf32>
        %122 = addf %121, %118 : vector<4xf32>
        %123 = vector.extract %113[2] : vector<3xf32>
        %124 = vector.broadcast %123 : f32 to vector<4xf32>
        %125 = mulf %124, %54 : vector<4xf32>
        %126 = addf %125, %122 : vector<4xf32>
        %127 = vector.shape_cast %126 : vector<4xf32> to vector<1x4xf32>
        scf.yield %73, %91, %109, %127 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
      }
      scf.yield %41#0, %41#1, %41#2, %41#3 : vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>, vector<1x4xf32>
    }
    %16 = vector.shape_cast %15#3 : vector<1x4xf32> to vector<4xf32>
    %17 = addi %11, %c3 : index
    %18 = addi %3, %17 : index
    %19 = addi %4, %9 : index
    %20 = addi %5, %12 : index
    %c4_1 = constant 4 : index
    %21 = divi_signed %20, %c4_1 : index
    %22 = vector.shape_cast %16 : vector<4xf32> to vector<4xf32>
    %23 = vector.bitcast %22 : vector<4xf32> to vector<4xf32>
    memref.store %23, %2[%c0, %18, %19, %21] : memref<1x112x112x8xvector<4xf32>>
    %24 = vector.shape_cast %15#2 : vector<1x4xf32> to vector<4xf32>
    %25 = addi %11, %c2 : index
    %26 = addi %3, %25 : index
    %c4_2 = constant 4 : index
    %27 = divi_signed %20, %c4_2 : index
    %28 = vector.shape_cast %24 : vector<4xf32> to vector<4xf32>
    %29 = vector.bitcast %28 : vector<4xf32> to vector<4xf32>
    memref.store %29, %2[%c0, %26, %19, %27] : memref<1x112x112x8xvector<4xf32>>
    %30 = vector.shape_cast %15#1 : vector<1x4xf32> to vector<4xf32>
    %31 = addi %11, %c1 : index
    %32 = addi %3, %31 : index
    %c4_3 = constant 4 : index
    %33 = divi_signed %20, %c4_3 : index
    %34 = vector.shape_cast %30 : vector<4xf32> to vector<4xf32>
    %35 = vector.bitcast %34 : vector<4xf32> to vector<4xf32>
    memref.store %35, %2[%c0, %32, %19, %33] : memref<1x112x112x8xvector<4xf32>>
    %36 = vector.shape_cast %15#0 : vector<1x4xf32> to vector<4xf32>
    %37 = addi %3, %11 : index
    %c4_4 = constant 4 : index
    %38 = divi_signed %20, %c4_4 : index
    %39 = vector.shape_cast %36 : vector<4xf32> to vector<4xf32>
    %40 = vector.bitcast %39 : vector<4xf32> to vector<4xf32>
    memref.store %40, %2[%c0, %37, %19, %38] : memref<1x112x112x8xvector<4xf32>>
    return
  }
  // Private HAL interface describing the three buffer bindings of this dispatch.
  // All three live in descriptor set 0 and are StorageBuffers.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    // Bindings 0 and 1 are read-only.
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    // Binding 2 is written; "Discard" is part of its declared access mode.
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ForOpCanonicalizationPass ***
// Dispatch entry point with local_size = [4, 4, 1]. Each invocation accumulates
// four vector<4xf32> results over a 3x3 loop nest and stores them to @wo2.
func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
  %cst = constant dense<0.000000e+00> : vector<1x4xf32>
  %c4 = constant 4 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  %cst_0 = constant 0.000000e+00 : f32
  %c1 = constant 1 : index
  %c3 = constant 3 : index
  %c0 = constant 0 : index
  %c16 = constant 16 : index
  %c8 = constant 8 : index
  // Buffer views: %0 is read with vector.transfer_read (1x225x225x3 f32),
  // %1 is read with memref.load (3x3x3x8 of vector<4xf32>),
  // %2 is the store target (1x112x112x8 of vector<4xf32>).
  %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
  %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x8xvector<4xf32>>
  %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x8xvector<4xf32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_id_z = hal.interface.workgroup.id[2] : index
  // Workgroup-level base offsets: %3/%4 feed dims 1/2 of the %2 store indices,
  // %5 feeds the last index of both the %1 loads and the %2 stores (after /4),
  // %6/%7 feed dims 1/2 of the %0 read indices.
  %3 = muli %workgroup_id_z, %c4 : index
  %4 = muli %workgroup_id_y, %c4 : index
  %5 = muli %workgroup_id_x, %c16 : index
  %6 = muli %workgroup_id_z, %c8 : index
  %7 = muli %workgroup_id_y, %c8 : index
  %8 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %9 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %10 = "gpu.thread_id"() {dimension = "z"} : () -> index
  // Thread-level offsets added to the workgroup bases above: %11 pairs with %3,
  // %12 with %5, %13 with %6 and %14 with %7 (%9 is added to %4 unscaled).
  %11 = muli %10, %c4 : index
  %12 = muli %8, %c4 : index
  %13 = muli %10, %c8 : index
  %14 = muli %9, %c2 : index
  // Four zero accumulators, each reshaped from vector<1x4xf32> to vector<4xf32>.
  %15 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
  %16 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
  %17 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
  %18 = vector.shape_cast %cst : vector<1x4xf32> to vector<4xf32>
  // Loop nest over %arg0 and %arg5 in [0, 3). Both index the first two dims of
  // %1 and are added to the dim-1 / dim-2 indices of the %0 reads. The four
  // accumulators are threaded through both loops as iter_args.
  %19:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %15, %arg2 = %16, %arg3 = %17, %arg4 = %18) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
    %33:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
      // Load three vector<4xf32> values from %1 (third index 0, 1, 2). The last
      // index is (%5 + %12) / 4, recomputed before every load in this snapshot.
      %34 = addi %5, %12 : index
      %35 = divi_signed %34, %c4 : index
      %36 = memref.load %1[%arg0, %arg5, %c0, %35] : memref<3x3x3x8xvector<4xf32>>
      %37 = divi_signed %34, %c4 : index
      %38 = memref.load %1[%arg0, %arg5, %c1, %37] : memref<3x3x3x8xvector<4xf32>>
      %39 = divi_signed %34, %c4 : index
      %40 = memref.load %1[%arg0, %arg5, %c2, %39] : memref<3x3x3x8xvector<4xf32>>
      // Accumulator 0 (%arg6): read 3 f32 from %0 at dim-1 index %6 + %13 + %arg0
      // and dim-2 index %7 + %14 + %arg5 (%44, reused by the three reads below).
      %41 = addi %13, %arg0 : index
      %42 = addi %14, %arg5 : index
      %43 = addi %6, %41 : index
      %44 = addi %7, %42 : index
      %45 = vector.transfer_read %0[%c0, %43, %44, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
      // Each scalar element is broadcast to 4 lanes, multiplied by the matching
      // %1 vector (%36, %38, %40) and added onto the running sum.
      %46 = vector.extract %45[0] : vector<3xf32>
      %47 = vector.broadcast %46 : f32 to vector<4xf32>
      %48 = mulf %47, %36 : vector<4xf32>
      %49 = addf %48, %arg6 : vector<4xf32>
      %50 = vector.extract %45[1] : vector<3xf32>
      %51 = vector.broadcast %50 : f32 to vector<4xf32>
      %52 = mulf %51, %38 : vector<4xf32>
      %53 = addf %52, %49 : vector<4xf32>
      %54 = vector.extract %45[2] : vector<3xf32>
      %55 = vector.broadcast %54 : f32 to vector<4xf32>
      %56 = mulf %55, %40 : vector<4xf32>
      %57 = addf %56, %53 : vector<4xf32>
      // Accumulator 1 (%arg7): same computation with the dim-1 index offset by +2.
      %58 = addi %arg0, %c2 : index
      %59 = addi %13, %58 : index
      %60 = addi %6, %59 : index
      %61 = vector.transfer_read %0[%c0, %60, %44, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
      %62 = vector.extract %61[0] : vector<3xf32>
      %63 = vector.broadcast %62 : f32 to vector<4xf32>
      %64 = mulf %63, %36 : vector<4xf32>
      %65 = addf %64, %arg7 : vector<4xf32>
      %66 = vector.extract %61[1] : vector<3xf32>
      %67 = vector.broadcast %66 : f32 to vector<4xf32>
      %68 = mulf %67, %38 : vector<4xf32>
      %69 = addf %68, %65 : vector<4xf32>
      %70 = vector.extract %61[2] : vector<3xf32>
      %71 = vector.broadcast %70 : f32 to vector<4xf32>
      %72 = mulf %71, %40 : vector<4xf32>
      %73 = addf %72, %69 : vector<4xf32>
      // Accumulator 2 (%arg8): dim-1 index offset by +4.
      %74 = addi %arg0, %c4 : index
      %75 = addi %13, %74 : index
      %76 = addi %6, %75 : index
      %77 = vector.transfer_read %0[%c0, %76, %44, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
      %78 = vector.extract %77[0] : vector<3xf32>
      %79 = vector.broadcast %78 : f32 to vector<4xf32>
      %80 = mulf %79, %36 : vector<4xf32>
      %81 = addf %80, %arg8 : vector<4xf32>
      %82 = vector.extract %77[1] : vector<3xf32>
      %83 = vector.broadcast %82 : f32 to vector<4xf32>
      %84 = mulf %83, %38 : vector<4xf32>
      %85 = addf %84, %81 : vector<4xf32>
      %86 = vector.extract %77[2] : vector<3xf32>
      %87 = vector.broadcast %86 : f32 to vector<4xf32>
      %88 = mulf %87, %40 : vector<4xf32>
      %89 = addf %88, %85 : vector<4xf32>
      // Accumulator 3 (%arg9): dim-1 index offset by +6.
      %90 = addi %arg0, %c6 : index
      %91 = addi %13, %90 : index
      %92 = addi %6, %91 : index
      %93 = vector.transfer_read %0[%c0, %92, %44, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
      %94 = vector.extract %93[0] : vector<3xf32>
      %95 = vector.broadcast %94 : f32 to vector<4xf32>
      %96 = mulf %95, %36 : vector<4xf32>
      %97 = addf %96, %arg9 : vector<4xf32>
      %98 = vector.extract %93[1] : vector<3xf32>
      %99 = vector.broadcast %98 : f32 to vector<4xf32>
      %100 = mulf %99, %38 : vector<4xf32>
      %101 = addf %100, %97 : vector<4xf32>
      %102 = vector.extract %93[2] : vector<3xf32>
      %103 = vector.broadcast %102 : f32 to vector<4xf32>
      %104 = mulf %103, %40 : vector<4xf32>
      %105 = addf %104, %101 : vector<4xf32>
      scf.yield %57, %73, %89, %105 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
    }
    // The outer loop forwards the inner loop's results unchanged.
    scf.yield %33#0, %33#1, %33#2, %33#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
  }
  // Store the four accumulators to %2 at dim-1 index %3 + %11 + {3, 2, 1, 0}
  // (results #3, #2, #1, #0 respectively), dim-2 index %4 + %9 and last index
  // (%5 + %12) / 4. The division is repeated before each store in this snapshot.
  %20 = addi %11, %c3 : index
  %21 = addi %3, %20 : index
  %22 = addi %4, %9 : index
  %23 = addi %5, %12 : index
  %24 = divi_signed %23, %c4 : index
  memref.store %19#3, %2[%c0, %21, %22, %24] : memref<1x112x112x8xvector<4xf32>>
  %25 = addi %11, %c2 : index
  %26 = addi %3, %25 : index
  %27 = divi_signed %23, %c4 : index
  memref.store %19#2, %2[%c0, %26, %22, %27] : memref<1x112x112x8xvector<4xf32>>
  %28 = addi %11, %c1 : index
  %29 = addi %3, %28 : index
  %30 = divi_signed %23, %c4 : index
  memref.store %19#1, %2[%c0, %29, %22, %30] : memref<1x112x112x8xvector<4xf32>>
  %31 = addi %3, %11 : index
  %32 = divi_signed %23, %c4 : index
  memref.store %19#0, %2[%c0, %31, %22, %32] : memref<1x112x112x8xvector<4xf32>>
  return
}

// *** IR Dump After Canonicalizer ***
// Module holding the dispatch function @conv_dispatch_0 and its HAL interface
// @legacy_io. The spv.target_env attribute records the SPIR-V version,
// capabilities, extensions and resource limits the module is compiled against.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Dispatch entry point with local_size = [4, 4, 1]. Each invocation accumulates
  // four vector<4xf32> results over a 3x3 loop nest and stores them to @wo2.
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    // Zero vector used directly as the initial value of all four accumulators.
    %cst = constant dense<0.000000e+00> : vector<4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    %c16 = constant 16 : index
    %c8 = constant 8 : index
    // Buffer views: %0 is read with vector.transfer_read, %1 with memref.load,
    // and %2 is the store target.
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x8xvector<4xf32>>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x8xvector<4xf32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Workgroup-level base offsets: %3/%4 feed dims 1/2 of the %2 store indices,
    // %5 feeds the last index of both the %1 loads and the %2 stores (after /4),
    // %6/%7 feed dims 1/2 of the %0 read indices.
    %3 = muli %workgroup_id_z, %c4 : index
    %4 = muli %workgroup_id_y, %c4 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %7 = muli %workgroup_id_y, %c8 : index
    %8 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %10 = "gpu.thread_id"() {dimension = "z"} : () -> index
    // Thread-level offsets added to the workgroup bases above: %11 pairs with %3,
    // %12 with %5, %13 with %6 and %14 with %7 (%9 is added to %4 unscaled).
    %11 = muli %10, %c4 : index
    %12 = muli %8, %c4 : index
    %13 = muli %10, %c8 : index
    %14 = muli %9, %c2 : index
    // Loop nest over %arg0 and %arg5 in [0, 3). Both index the first two dims of
    // %1 and are added to the dim-1 / dim-2 indices of the %0 reads. The four
    // accumulators are threaded through both loops as iter_args.
    %15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
      %29:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        // Load three vector<4xf32> values from %1 (third index 0, 1, 2). The last
        // index is (%5 + %12) / 4, recomputed before every load in this snapshot.
        %30 = addi %5, %12 : index
        %31 = divi_signed %30, %c4 : index
        %32 = memref.load %1[%arg0, %arg5, %c0, %31] : memref<3x3x3x8xvector<4xf32>>
        %33 = divi_signed %30, %c4 : index
        %34 = memref.load %1[%arg0, %arg5, %c1, %33] : memref<3x3x3x8xvector<4xf32>>
        %35 = divi_signed %30, %c4 : index
        %36 = memref.load %1[%arg0, %arg5, %c2, %35] : memref<3x3x3x8xvector<4xf32>>
        // Accumulator 0 (%arg6): read 3 f32 from %0 at dim-1 index %6 + %13 + %arg0
        // and dim-2 index %7 + %14 + %arg5 (%40, reused by the three reads below).
        %37 = addi %13, %arg0 : index
        %38 = addi %14, %arg5 : index
        %39 = addi %6, %37 : index
        %40 = addi %7, %38 : index
        %41 = vector.transfer_read %0[%c0, %39, %40, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        // Each scalar element is broadcast to 4 lanes, multiplied by the matching
        // %1 vector (%32, %34, %36) and added onto the running sum.
        %42 = vector.extract %41[0] : vector<3xf32>
        %43 = vector.broadcast %42 : f32 to vector<4xf32>
        %44 = mulf %43, %32 : vector<4xf32>
        %45 = addf %44, %arg6 : vector<4xf32>
        %46 = vector.extract %41[1] : vector<3xf32>
        %47 = vector.broadcast %46 : f32 to vector<4xf32>
        %48 = mulf %47, %34 : vector<4xf32>
        %49 = addf %48, %45 : vector<4xf32>
        %50 = vector.extract %41[2] : vector<3xf32>
        %51 = vector.broadcast %50 : f32 to vector<4xf32>
        %52 = mulf %51, %36 : vector<4xf32>
        %53 = addf %52, %49 : vector<4xf32>
        // Accumulator 1 (%arg7): same computation with the dim-1 index offset by +2.
        %54 = addi %arg0, %c2 : index
        %55 = addi %13, %54 : index
        %56 = addi %6, %55 : index
        %57 = vector.transfer_read %0[%c0, %56, %40, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %58 = vector.extract %57[0] : vector<3xf32>
        %59 = vector.broadcast %58 : f32 to vector<4xf32>
        %60 = mulf %59, %32 : vector<4xf32>
        %61 = addf %60, %arg7 : vector<4xf32>
        %62 = vector.extract %57[1] : vector<3xf32>
        %63 = vector.broadcast %62 : f32 to vector<4xf32>
        %64 = mulf %63, %34 : vector<4xf32>
        %65 = addf %64, %61 : vector<4xf32>
        %66 = vector.extract %57[2] : vector<3xf32>
        %67 = vector.broadcast %66 : f32 to vector<4xf32>
        %68 = mulf %67, %36 : vector<4xf32>
        %69 = addf %68, %65 : vector<4xf32>
        // Accumulator 2 (%arg8): dim-1 index offset by +4.
        %70 = addi %arg0, %c4 : index
        %71 = addi %13, %70 : index
        %72 = addi %6, %71 : index
        %73 = vector.transfer_read %0[%c0, %72, %40, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %74 = vector.extract %73[0] : vector<3xf32>
        %75 = vector.broadcast %74 : f32 to vector<4xf32>
        %76 = mulf %75, %32 : vector<4xf32>
        %77 = addf %76, %arg8 : vector<4xf32>
        %78 = vector.extract %73[1] : vector<3xf32>
        %79 = vector.broadcast %78 : f32 to vector<4xf32>
        %80 = mulf %79, %34 : vector<4xf32>
        %81 = addf %80, %77 : vector<4xf32>
        %82 = vector.extract %73[2] : vector<3xf32>
        %83 = vector.broadcast %82 : f32 to vector<4xf32>
        %84 = mulf %83, %36 : vector<4xf32>
        %85 = addf %84, %81 : vector<4xf32>
        // Accumulator 3 (%arg9): dim-1 index offset by +6.
        %86 = addi %arg0, %c6 : index
        %87 = addi %13, %86 : index
        %88 = addi %6, %87 : index
        %89 = vector.transfer_read %0[%c0, %88, %40, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %90 = vector.extract %89[0] : vector<3xf32>
        %91 = vector.broadcast %90 : f32 to vector<4xf32>
        %92 = mulf %91, %32 : vector<4xf32>
        %93 = addf %92, %arg9 : vector<4xf32>
        %94 = vector.extract %89[1] : vector<3xf32>
        %95 = vector.broadcast %94 : f32 to vector<4xf32>
        %96 = mulf %95, %34 : vector<4xf32>
        %97 = addf %96, %93 : vector<4xf32>
        %98 = vector.extract %89[2] : vector<3xf32>
        %99 = vector.broadcast %98 : f32 to vector<4xf32>
        %100 = mulf %99, %36 : vector<4xf32>
        %101 = addf %100, %97 : vector<4xf32>
        scf.yield %53, %69, %85, %101 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      // The outer loop forwards the inner loop's results unchanged.
      scf.yield %29#0, %29#1, %29#2, %29#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
    }
    // Store the four accumulators to %2 at dim-1 index %3 + %11 + {3, 2, 1, 0}
    // (results #3, #2, #1, #0 respectively), dim-2 index %4 + %9 and last index
    // (%5 + %12) / 4. The division is repeated before each store in this snapshot.
    %16 = addi %11, %c3 : index
    %17 = addi %3, %16 : index
    %18 = addi %4, %9 : index
    %19 = addi %5, %12 : index
    %20 = divi_signed %19, %c4 : index
    memref.store %15#3, %2[%c0, %17, %18, %20] : memref<1x112x112x8xvector<4xf32>>
    %21 = addi %11, %c2 : index
    %22 = addi %3, %21 : index
    %23 = divi_signed %19, %c4 : index
    memref.store %15#2, %2[%c0, %22, %18, %23] : memref<1x112x112x8xvector<4xf32>>
    %24 = addi %11, %c1 : index
    %25 = addi %3, %24 : index
    %26 = divi_signed %19, %c4 : index
    memref.store %15#1, %2[%c0, %25, %18, %26] : memref<1x112x112x8xvector<4xf32>>
    %27 = addi %3, %11 : index
    %28 = divi_signed %19, %c4 : index
    memref.store %15#0, %2[%c0, %27, %18, %28] : memref<1x112x112x8xvector<4xf32>>
    return
  }
  // Private HAL interface: three StorageBuffer bindings in set 0
  // (bindings 0 and 1 read-only, binding 2 written).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module holding the dispatch function @conv_dispatch_0 and its HAL interface
// @legacy_io. The spv.target_env attribute records the SPIR-V version,
// capabilities, extensions and resource limits the module is compiled against.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  // Dispatch entry point with local_size = [4, 4, 1]. Each invocation accumulates
  // four vector<4xf32> results over a 3x3 loop nest and stores them to @wo2.
  func @conv_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
    // Zero vector used directly as the initial value of all four accumulators.
    %cst = constant dense<0.000000e+00> : vector<4xf32>
    %c4 = constant 4 : index
    %c2 = constant 2 : index
    %c6 = constant 6 : index
    %cst_0 = constant 0.000000e+00 : f32
    %c1 = constant 1 : index
    %c3 = constant 3 : index
    %c0 = constant 0 : index
    %c16 = constant 16 : index
    %c8 = constant 8 : index
    // Buffer views: %0 is read with vector.transfer_read, %1 with memref.load,
    // and %2 is the store target.
    %0 = hal.interface.binding.subspan @legacy_io::@ro0[%c0] : memref<1x225x225x3xf32>
    %1 = hal.interface.binding.subspan @legacy_io::@ro1[%c0] : memref<3x3x3x8xvector<4xf32>>
    %2 = hal.interface.binding.subspan @legacy_io::@wo2[%c0] : memref<1x112x112x8xvector<4xf32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_id_z = hal.interface.workgroup.id[2] : index
    // Workgroup-level base offsets: %3/%4 feed dims 1/2 of the %2 store indices,
    // %5 feeds the last index of both the %1 loads and the %2 stores (after /4),
    // %6/%7 feed dims 1/2 of the %0 read indices.
    %3 = muli %workgroup_id_z, %c4 : index
    %4 = muli %workgroup_id_y, %c4 : index
    %5 = muli %workgroup_id_x, %c16 : index
    %6 = muli %workgroup_id_z, %c8 : index
    %7 = muli %workgroup_id_y, %c8 : index
    %8 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %9 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %10 = "gpu.thread_id"() {dimension = "z"} : () -> index
    // Thread-level offsets added to the workgroup bases above: %11 pairs with %3,
    // %12 with %5, %13 with %6 and %14 with %7 (%9 is added to %4 unscaled).
    %11 = muli %10, %c4 : index
    %12 = muli %8, %c4 : index
    %13 = muli %10, %c8 : index
    %14 = muli %9, %c2 : index
    // Loop nest over %arg0 and %arg5 in [0, 3). Both index the first two dims of
    // %1 and are added to the dim-1 / dim-2 indices of the %0 reads. The four
    // accumulators are threaded through both loops as iter_args.
    %15:4 = scf.for %arg0 = %c0 to %c3 step %c1 iter_args(%arg1 = %cst, %arg2 = %cst, %arg3 = %cst, %arg4 = %cst) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
      %26:4 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg1, %arg7 = %arg2, %arg8 = %arg3, %arg9 = %arg4) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) {
        // Load three vector<4xf32> values from %1 (third index 0, 1, 2). All three
        // loads share the single last index %28 = (%5 + %12) / 4.
        %27 = addi %5, %12 : index
        %28 = divi_signed %27, %c4 : index
        %29 = memref.load %1[%arg0, %arg5, %c0, %28] : memref<3x3x3x8xvector<4xf32>>
        %30 = memref.load %1[%arg0, %arg5, %c1, %28] : memref<3x3x3x8xvector<4xf32>>
        %31 = memref.load %1[%arg0, %arg5, %c2, %28] : memref<3x3x3x8xvector<4xf32>>
        // Accumulator 0 (%arg6): read 3 f32 from %0 at dim-1 index %6 + %13 + %arg0
        // and dim-2 index %7 + %14 + %arg5 (%35, reused by the three reads below).
        %32 = addi %13, %arg0 : index
        %33 = addi %14, %arg5 : index
        %34 = addi %6, %32 : index
        %35 = addi %7, %33 : index
        %36 = vector.transfer_read %0[%c0, %34, %35, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        // Each scalar element is broadcast to 4 lanes, multiplied by the matching
        // %1 vector (%29, %30, %31) and added onto the running sum.
        %37 = vector.extract %36[0] : vector<3xf32>
        %38 = vector.broadcast %37 : f32 to vector<4xf32>
        %39 = mulf %38, %29 : vector<4xf32>
        %40 = addf %39, %arg6 : vector<4xf32>
        %41 = vector.extract %36[1] : vector<3xf32>
        %42 = vector.broadcast %41 : f32 to vector<4xf32>
        %43 = mulf %42, %30 : vector<4xf32>
        %44 = addf %43, %40 : vector<4xf32>
        %45 = vector.extract %36[2] : vector<3xf32>
        %46 = vector.broadcast %45 : f32 to vector<4xf32>
        %47 = mulf %46, %31 : vector<4xf32>
        %48 = addf %47, %44 : vector<4xf32>
        // Accumulator 1 (%arg7): same computation with the dim-1 index offset by +2.
        %49 = addi %arg0, %c2 : index
        %50 = addi %13, %49 : index
        %51 = addi %6, %50 : index
        %52 = vector.transfer_read %0[%c0, %51, %35, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %53 = vector.extract %52[0] : vector<3xf32>
        %54 = vector.broadcast %53 : f32 to vector<4xf32>
        %55 = mulf %54, %29 : vector<4xf32>
        %56 = addf %55, %arg7 : vector<4xf32>
        %57 = vector.extract %52[1] : vector<3xf32>
        %58 = vector.broadcast %57 : f32 to vector<4xf32>
        %59 = mulf %58, %30 : vector<4xf32>
        %60 = addf %59, %56 : vector<4xf32>
        %61 = vector.extract %52[2] : vector<3xf32>
        %62 = vector.broadcast %61 : f32 to vector<4xf32>
        %63 = mulf %62, %31 : vector<4xf32>
        %64 = addf %63, %60 : vector<4xf32>
        // Accumulator 2 (%arg8): dim-1 index offset by +4.
        %65 = addi %arg0, %c4 : index
        %66 = addi %13, %65 : index
        %67 = addi %6, %66 : index
        %68 = vector.transfer_read %0[%c0, %67, %35, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %69 = vector.extract %68[0] : vector<3xf32>
        %70 = vector.broadcast %69 : f32 to vector<4xf32>
        %71 = mulf %70, %29 : vector<4xf32>
        %72 = addf %71, %arg8 : vector<4xf32>
        %73 = vector.extract %68[1] : vector<3xf32>
        %74 = vector.broadcast %73 : f32 to vector<4xf32>
        %75 = mulf %74, %30 : vector<4xf32>
        %76 = addf %75, %72 : vector<4xf32>
        %77 = vector.extract %68[2] : vector<3xf32>
        %78 = vector.broadcast %77 : f32 to vector<4xf32>
        %79 = mulf %78, %31 : vector<4xf32>
        %80 = addf %79, %76 : vector<4xf32>
        // Accumulator 3 (%arg9): dim-1 index offset by +6.
        %81 = addi %arg0, %c6 : index
        %82 = addi %13, %81 : index
        %83 = addi %6, %82 : index
        %84 = vector.transfer_read %0[%c0, %83, %35, %c0], %cst_0 {masked = [false]} : memref<1x225x225x3xf32>, vector<3xf32>
        %85 = vector.extract %84[0] : vector<3xf32>
        %86 = vector.broadcast %85 : f32 to vector<4xf32>
        %87 = mulf %86, %29 : vector<4xf32>
        %88 = addf %87, %arg9 : vector<4xf32>
        %89 = vector.extract %84[1] : vector<3xf32>
        %90 = vector.broadcast %89 : f32 to vector<4xf32>
        %91 = mulf %90, %30 : vector<4xf32>
        %92 = addf %91, %88 : vector<4xf32>
        %93 = vector.extract %84[2] : vector<3xf32>
        %94 = vector.broadcast %93 : f32 to vector<4xf32>
        %95 = mulf %94, %31 : vector<4xf32>
        %96 = addf %95, %92 : vector<4xf32>
        scf.yield %48, %64, %80, %96 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
      }
      // The outer loop forwards the inner loop's results unchanged.
      scf.yield %26#0, %26#1, %26#2, %26#3 : vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>
    }
    // Store the four accumulators to %2 at dim-1 index %3 + %11 + {3, 2, 1, 0}
    // (results #3, #2, #1, #0 respectively). All four stores share the dim-2
    // index %18 = %4 + %9 and the last index %20 = (%5 + %12) / 4.
    %16 = addi %11, %c3 : index
    %17 = addi %3, %16 : index
    %18 = addi %4, %9 : index
    %19 = addi %5, %12 : index
    %20 = divi_signed %19, %c4 : index
    memref.store %15#3, %2[%c0, %17, %18, %20] : memref<1x112x112x8xvector<4xf32>>
    %21 = addi %11, %c2 : index
    %22 = addi %3, %21 : index
    memref.store %15#2, %2[%c0, %22, %18, %20] : memref<1x112x112x8xvector<4xf32>>
    %23 = addi %11, %c1 : index
    %24 = addi %3, %23 : index
    memref.store %15#1, %2[%c0, %24, %18, %20] : memref<1x112x112x8xvector<4xf32>>
    %25 = addi %3, %11 : index
    memref.store %15#0, %2[%c0, %25, %18, %20] : memref<1x112x112x8xvector<4xf32>>
    return
  }
  // Private HAL interface: three StorageBuffer bindings in set 0
  // (bindings 0 and 1 read-only, binding 2 written).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::(anonymous namespace)::ConvertToSPIRVPass ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
  spv.module Logical GLSL450 {
    spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
    spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
    spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
    spv.func @conv_dispatch_0() "None" attributes {spv.entry_point_abi = {local_size = dense<[4, 4, 1]> : vector<3xi32>}} {
      %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
      %1 = spv.Constant 4 : i32
      %2 = spv.Constant 2 : i32
      %3 = spv.Constant 6 : i32
      %4 = spv.Constant 0.000000e+00 : f32
      %5 = spv.Constant 1 : i32
      %6 = spv.Constant 3 : i32
      %7 = spv.Constant 0 : i32
      %8 = spv.Constant 16 : i32
      %9 = spv.Constant 8 : i32
      %10 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
      %11 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
      %12 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
      %13 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %14 = spv.Load "Input" %13 : vector<3xi32>
      %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
      %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %17 = spv.Load "Input" %16 : vector<3xi32>
      %18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
      %19 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %20 = spv.Load "Input" %19 : vector<3xi32>
      %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
      %22 = spv.IMul %21, %1 : i32
      %23 = spv.IMul %18, %1 : i32
      %24 = spv.IMul %15, %8 : i32
      %25 = spv.IMul %21, %9 : i32
      %26 = spv.IMul %18, %9 : i32
      %27 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %28 = spv.Load "Input" %27 : vector<3xi32>
      %29 = spv.CompositeExtract %28[0 : i32] : vector<3xi32>
      %30 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %31 = spv.Load "Input" %30 : vector<3xi32>
      %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
      %33 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %34 = spv.Load "Input" %33 : vector<3xi32>
      %35 = spv.CompositeExtract %34[2 : i32] : vector<3xi32>
      %36 = spv.IMul %35, %1 : i32
      %37 = spv.IMul %29, %1 : i32
      %38 = spv.IMul %35, %9 : i32
      %39 = spv.IMul %32, %2 : i32
      %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %43 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      spv.mlir.loop {
        spv.Branch ^bb1(%7, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%118: i32, %119: vector<4xf32>, %120: vector<4xf32>, %121: vector<4xf32>, %122: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %123 = spv.SLessThan %118, %6 : i32
        spv.BranchConditional %123, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %124 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %125 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %126 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %127 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        spv.mlir.loop {
          spv.Branch ^bb1(%7, %119, %120, %121, %122 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb1(%133: i32, %134: vector<4xf32>, %135: vector<4xf32>, %136: vector<4xf32>, %137: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
          %138 = spv.SLessThan %133, %6 : i32
          spv.BranchConditional %138, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          %139 = spv.IAdd %24, %37 : i32
          %140 = spv.SDiv %139, %1 : i32
          %141 = spv.Constant 0 : i32
          %142 = spv.Constant 0 : i32
          %143 = spv.Constant 72 : i32
          %144 = spv.IMul %143, %118 : i32
          %145 = spv.IAdd %142, %144 : i32
          %146 = spv.Constant 24 : i32
          %147 = spv.IMul %146, %133 : i32
          %148 = spv.IAdd %145, %147 : i32
          %149 = spv.Constant 8 : i32
          %150 = spv.IMul %149, %7 : i32
          %151 = spv.IAdd %148, %150 : i32
          %152 = spv.Constant 1 : i32
          %153 = spv.IMul %152, %140 : i32
          %154 = spv.IAdd %151, %153 : i32
          %155 = spv.AccessChain %11[%141, %154] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %156 = spv.Load "StorageBuffer" %155 : vector<4xf32>
          %157 = spv.Constant 0 : i32
          %158 = spv.Constant 0 : i32
          %159 = spv.Constant 72 : i32
          %160 = spv.IMul %159, %118 : i32
          %161 = spv.IAdd %158, %160 : i32
          %162 = spv.Constant 24 : i32
          %163 = spv.IMul %162, %133 : i32
          %164 = spv.IAdd %161, %163 : i32
          %165 = spv.Constant 8 : i32
          %166 = spv.IMul %165, %5 : i32
          %167 = spv.IAdd %164, %166 : i32
          %168 = spv.Constant 1 : i32
          %169 = spv.IMul %168, %140 : i32
          %170 = spv.IAdd %167, %169 : i32
          %171 = spv.AccessChain %11[%157, %170] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %172 = spv.Load "StorageBuffer" %171 : vector<4xf32>
          %173 = spv.Constant 0 : i32
          %174 = spv.Constant 0 : i32
          %175 = spv.Constant 72 : i32
          %176 = spv.IMul %175, %118 : i32
          %177 = spv.IAdd %174, %176 : i32
          %178 = spv.Constant 24 : i32
          %179 = spv.IMul %178, %133 : i32
          %180 = spv.IAdd %177, %179 : i32
          %181 = spv.Constant 8 : i32
          %182 = spv.IMul %181, %2 : i32
          %183 = spv.IAdd %180, %182 : i32
          %184 = spv.Constant 1 : i32
          %185 = spv.IMul %184, %140 : i32
          %186 = spv.IAdd %183, %185 : i32
          %187 = spv.AccessChain %11[%173, %186] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
          %188 = spv.Load "StorageBuffer" %187 : vector<4xf32>
          %189 = spv.IAdd %38, %118 : i32
          %190 = spv.IAdd %39, %133 : i32
          %191 = spv.IAdd %25, %189 : i32
          %192 = spv.IAdd %26, %190 : i32
          %193 = spv.Constant 0 : i32
          %194 = spv.Constant 0 : i32
          %195 = spv.Constant 0 : i32
          %196 = spv.Constant 151875 : i32
          %197 = spv.IMul %196, %7 : i32
          %198 = spv.IAdd %195, %197 : i32
          %199 = spv.Constant 675 : i32
          %200 = spv.IMul %199, %191 : i32
          %201 = spv.IAdd %198, %200 : i32
          %202 = spv.Constant 3 : i32
          %203 = spv.IMul %202, %192 : i32
          %204 = spv.IAdd %201, %203 : i32
          %205 = spv.Constant 1 : i32
          %206 = spv.IMul %205, %193 : i32
          %207 = spv.IAdd %204, %206 : i32
          %208 = spv.AccessChain %10[%194, %207] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %209 = spv.Load "StorageBuffer" %208 : f32
          %210 = spv.Constant 1 : i32
          %211 = spv.Constant 0 : i32
          %212 = spv.Constant 0 : i32
          %213 = spv.Constant 151875 : i32
          %214 = spv.IMul %213, %7 : i32
          %215 = spv.IAdd %212, %214 : i32
          %216 = spv.Constant 675 : i32
          %217 = spv.IMul %216, %191 : i32
          %218 = spv.IAdd %215, %217 : i32
          %219 = spv.Constant 3 : i32
          %220 = spv.IMul %219, %192 : i32
          %221 = spv.IAdd %218, %220 : i32
          %222 = spv.Constant 1 : i32
          %223 = spv.IMul %222, %210 : i32
          %224 = spv.IAdd %221, %223 : i32
          %225 = spv.AccessChain %10[%211, %224] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %226 = spv.Load "StorageBuffer" %225 : f32
          %227 = spv.Constant 2 : i32
          %228 = spv.Constant 0 : i32
          %229 = spv.Constant 0 : i32
          %230 = spv.Constant 151875 : i32
          %231 = spv.IMul %230, %7 : i32
          %232 = spv.IAdd %229, %231 : i32
          %233 = spv.Constant 675 : i32
          %234 = spv.IMul %233, %191 : i32
          %235 = spv.IAdd %232, %234 : i32
          %236 = spv.Constant 3 : i32
          %237 = spv.IMul %236, %192 : i32
          %238 = spv.IAdd %235, %237 : i32
          %239 = spv.Constant 1 : i32
          %240 = spv.IMul %239, %227 : i32
          %241 = spv.IAdd %238, %240 : i32
          %242 = spv.AccessChain %10[%228, %241] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %243 = spv.Load "StorageBuffer" %242 : f32
          %244 = spv.CompositeConstruct %209, %226, %243 : vector<3xf32>
          %245 = spv.CompositeExtract %244[0 : i32] : vector<3xf32>
          %246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
          %247 = spv.FMul %246, %156 : vector<4xf32>
          %248 = spv.FAdd %247, %134 : vector<4xf32>
          %249 = spv.CompositeExtract %244[1 : i32] : vector<3xf32>
          %250 = spv.CompositeConstruct %249, %249, %249, %249 : vector<4xf32>
          %251 = spv.FMul %250, %172 : vector<4xf32>
          %252 = spv.FAdd %251, %248 : vector<4xf32>
          %253 = spv.CompositeExtract %244[2 : i32] : vector<3xf32>
          %254 = spv.CompositeConstruct %253, %253, %253, %253 : vector<4xf32>
          %255 = spv.FMul %254, %188 : vector<4xf32>
          %256 = spv.FAdd %255, %252 : vector<4xf32>
          %257 = spv.IAdd %118, %2 : i32
          %258 = spv.IAdd %38, %257 : i32
          %259 = spv.IAdd %25, %258 : i32
          %260 = spv.Constant 0 : i32
          %261 = spv.Constant 0 : i32
          %262 = spv.Constant 0 : i32
          %263 = spv.Constant 151875 : i32
          %264 = spv.IMul %263, %7 : i32
          %265 = spv.IAdd %262, %264 : i32
          %266 = spv.Constant 675 : i32
          %267 = spv.IMul %266, %259 : i32
          %268 = spv.IAdd %265, %267 : i32
          %269 = spv.Constant 3 : i32
          %270 = spv.IMul %269, %192 : i32
          %271 = spv.IAdd %268, %270 : i32
          %272 = spv.Constant 1 : i32
          %273 = spv.IMul %272, %260 : i32
          %274 = spv.IAdd %271, %273 : i32
          %275 = spv.AccessChain %10[%261, %274] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %276 = spv.Load "StorageBuffer" %275 : f32
          %277 = spv.Constant 1 : i32
          %278 = spv.Constant 0 : i32
          %279 = spv.Constant 0 : i32
          %280 = spv.Constant 151875 : i32
          %281 = spv.IMul %280, %7 : i32
          %282 = spv.IAdd %279, %281 : i32
          %283 = spv.Constant 675 : i32
          %284 = spv.IMul %283, %259 : i32
          %285 = spv.IAdd %282, %284 : i32
          %286 = spv.Constant 3 : i32
          %287 = spv.IMul %286, %192 : i32
          %288 = spv.IAdd %285, %287 : i32
          %289 = spv.Constant 1 : i32
          %290 = spv.IMul %289, %277 : i32
          %291 = spv.IAdd %288, %290 : i32
          %292 = spv.AccessChain %10[%278, %291] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %293 = spv.Load "StorageBuffer" %292 : f32
          %294 = spv.Constant 2 : i32
          %295 = spv.Constant 0 : i32
          %296 = spv.Constant 0 : i32
          %297 = spv.Constant 151875 : i32
          %298 = spv.IMul %297, %7 : i32
          %299 = spv.IAdd %296, %298 : i32
          %300 = spv.Constant 675 : i32
          %301 = spv.IMul %300, %259 : i32
          %302 = spv.IAdd %299, %301 : i32
          %303 = spv.Constant 3 : i32
          %304 = spv.IMul %303, %192 : i32
          %305 = spv.IAdd %302, %304 : i32
          %306 = spv.Constant 1 : i32
          %307 = spv.IMul %306, %294 : i32
          %308 = spv.IAdd %305, %307 : i32
          %309 = spv.AccessChain %10[%295, %308] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %310 = spv.Load "StorageBuffer" %309 : f32
          %311 = spv.CompositeConstruct %276, %293, %310 : vector<3xf32>
          %312 = spv.CompositeExtract %311[0 : i32] : vector<3xf32>
          %313 = spv.CompositeConstruct %312, %312, %312, %312 : vector<4xf32>
          %314 = spv.FMul %313, %156 : vector<4xf32>
          %315 = spv.FAdd %314, %135 : vector<4xf32>
          %316 = spv.CompositeExtract %311[1 : i32] : vector<3xf32>
          %317 = spv.CompositeConstruct %316, %316, %316, %316 : vector<4xf32>
          %318 = spv.FMul %317, %172 : vector<4xf32>
          %319 = spv.FAdd %318, %315 : vector<4xf32>
          %320 = spv.CompositeExtract %311[2 : i32] : vector<3xf32>
          %321 = spv.CompositeConstruct %320, %320, %320, %320 : vector<4xf32>
          %322 = spv.FMul %321, %188 : vector<4xf32>
          %323 = spv.FAdd %322, %319 : vector<4xf32>
          %324 = spv.IAdd %118, %1 : i32
          %325 = spv.IAdd %38, %324 : i32
          %326 = spv.IAdd %25, %325 : i32
          %327 = spv.Constant 0 : i32
          %328 = spv.Constant 0 : i32
          %329 = spv.Constant 0 : i32
          %330 = spv.Constant 151875 : i32
          %331 = spv.IMul %330, %7 : i32
          %332 = spv.IAdd %329, %331 : i32
          %333 = spv.Constant 675 : i32
          %334 = spv.IMul %333, %326 : i32
          %335 = spv.IAdd %332, %334 : i32
          %336 = spv.Constant 3 : i32
          %337 = spv.IMul %336, %192 : i32
          %338 = spv.IAdd %335, %337 : i32
          %339 = spv.Constant 1 : i32
          %340 = spv.IMul %339, %327 : i32
          %341 = spv.IAdd %338, %340 : i32
          %342 = spv.AccessChain %10[%328, %341] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %343 = spv.Load "StorageBuffer" %342 : f32
          %344 = spv.Constant 1 : i32
          %345 = spv.Constant 0 : i32
          %346 = spv.Constant 0 : i32
          %347 = spv.Constant 151875 : i32
          %348 = spv.IMul %347, %7 : i32
          %349 = spv.IAdd %346, %348 : i32
          %350 = spv.Constant 675 : i32
          %351 = spv.IMul %350, %326 : i32
          %352 = spv.IAdd %349, %351 : i32
          %353 = spv.Constant 3 : i32
          %354 = spv.IMul %353, %192 : i32
          %355 = spv.IAdd %352, %354 : i32
          %356 = spv.Constant 1 : i32
          %357 = spv.IMul %356, %344 : i32
          %358 = spv.IAdd %355, %357 : i32
          %359 = spv.AccessChain %10[%345, %358] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %360 = spv.Load "StorageBuffer" %359 : f32
          %361 = spv.Constant 2 : i32
          %362 = spv.Constant 0 : i32
          %363 = spv.Constant 0 : i32
          %364 = spv.Constant 151875 : i32
          %365 = spv.IMul %364, %7 : i32
          %366 = spv.IAdd %363, %365 : i32
          %367 = spv.Constant 675 : i32
          %368 = spv.IMul %367, %326 : i32
          %369 = spv.IAdd %366, %368 : i32
          %370 = spv.Constant 3 : i32
          %371 = spv.IMul %370, %192 : i32
          %372 = spv.IAdd %369, %371 : i32
          %373 = spv.Constant 1 : i32
          %374 = spv.IMul %373, %361 : i32
          %375 = spv.IAdd %372, %374 : i32
          %376 = spv.AccessChain %10[%362, %375] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %377 = spv.Load "StorageBuffer" %376 : f32
          %378 = spv.CompositeConstruct %343, %360, %377 : vector<3xf32>
          %379 = spv.CompositeExtract %378[0 : i32] : vector<3xf32>
          %380 = spv.CompositeConstruct %379, %379, %379, %379 : vector<4xf32>
          %381 = spv.FMul %380, %156 : vector<4xf32>
          %382 = spv.FAdd %381, %136 : vector<4xf32>
          %383 = spv.CompositeExtract %378[1 : i32] : vector<3xf32>
          %384 = spv.CompositeConstruct %383, %383, %383, %383 : vector<4xf32>
          %385 = spv.FMul %384, %172 : vector<4xf32>
          %386 = spv.FAdd %385, %382 : vector<4xf32>
          %387 = spv.CompositeExtract %378[2 : i32] : vector<3xf32>
          %388 = spv.CompositeConstruct %387, %387, %387, %387 : vector<4xf32>
          %389 = spv.FMul %388, %188 : vector<4xf32>
          %390 = spv.FAdd %389, %386 : vector<4xf32>
          %391 = spv.IAdd %118, %3 : i32
          %392 = spv.IAdd %38, %391 : i32
          %393 = spv.IAdd %25, %392 : i32
          %394 = spv.Constant 0 : i32
          %395 = spv.Constant 0 : i32
          %396 = spv.Constant 0 : i32
          %397 = spv.Constant 151875 : i32
          %398 = spv.IMul %397, %7 : i32
          %399 = spv.IAdd %396, %398 : i32
          %400 = spv.Constant 675 : i32
          %401 = spv.IMul %400, %393 : i32
          %402 = spv.IAdd %399, %401 : i32
          %403 = spv.Constant 3 : i32
          %404 = spv.IMul %403, %192 : i32
          %405 = spv.IAdd %402, %404 : i32
          %406 = spv.Constant 1 : i32
          %407 = spv.IMul %406, %394 : i32
          %408 = spv.IAdd %405, %407 : i32
          %409 = spv.AccessChain %10[%395, %408] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %410 = spv.Load "StorageBuffer" %409 : f32
          %411 = spv.Constant 1 : i32
          %412 = spv.Constant 0 : i32
          %413 = spv.Constant 0 : i32
          %414 = spv.Constant 151875 : i32
          %415 = spv.IMul %414, %7 : i32
          %416 = spv.IAdd %413, %415 : i32
          %417 = spv.Constant 675 : i32
          %418 = spv.IMul %417, %393 : i32
          %419 = spv.IAdd %416, %418 : i32
          %420 = spv.Constant 3 : i32
          %421 = spv.IMul %420, %192 : i32
          %422 = spv.IAdd %419, %421 : i32
          %423 = spv.Constant 1 : i32
          %424 = spv.IMul %423, %411 : i32
          %425 = spv.IAdd %422, %424 : i32
          %426 = spv.AccessChain %10[%412, %425] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %427 = spv.Load "StorageBuffer" %426 : f32
          %428 = spv.Constant 2 : i32
          %429 = spv.Constant 0 : i32
          %430 = spv.Constant 0 : i32
          %431 = spv.Constant 151875 : i32
          %432 = spv.IMul %431, %7 : i32
          %433 = spv.IAdd %430, %432 : i32
          %434 = spv.Constant 675 : i32
          %435 = spv.IMul %434, %393 : i32
          %436 = spv.IAdd %433, %435 : i32
          %437 = spv.Constant 3 : i32
          %438 = spv.IMul %437, %192 : i32
          %439 = spv.IAdd %436, %438 : i32
          %440 = spv.Constant 1 : i32
          %441 = spv.IMul %440, %428 : i32
          %442 = spv.IAdd %439, %441 : i32
          %443 = spv.AccessChain %10[%429, %442] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
          %444 = spv.Load "StorageBuffer" %443 : f32
          %445 = spv.CompositeConstruct %410, %427, %444 : vector<3xf32>
          %446 = spv.CompositeExtract %445[0 : i32] : vector<3xf32>
          %447 = spv.CompositeConstruct %446, %446, %446, %446 : vector<4xf32>
          %448 = spv.FMul %447, %156 : vector<4xf32>
          %449 = spv.FAdd %448, %137 : vector<4xf32>
          %450 = spv.CompositeExtract %445[1 : i32] : vector<3xf32>
          %451 = spv.CompositeConstruct %450, %450, %450, %450 : vector<4xf32>
          %452 = spv.FMul %451, %172 : vector<4xf32>
          %453 = spv.FAdd %452, %449 : vector<4xf32>
          %454 = spv.CompositeExtract %445[2 : i32] : vector<3xf32>
          %455 = spv.CompositeConstruct %454, %454, %454, %454 : vector<4xf32>
          %456 = spv.FMul %455, %188 : vector<4xf32>
          %457 = spv.FAdd %456, %453 : vector<4xf32>
          spv.Store "Function" %124, %256 : vector<4xf32>
          spv.Store "Function" %125, %323 : vector<4xf32>
          spv.Store "Function" %126, %390 : vector<4xf32>
          spv.Store "Function" %127, %457 : vector<4xf32>
          %458 = spv.IAdd %133, %5 : i32
          spv.Branch ^bb1(%458, %256, %323, %390, %457 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb3:  // pred: ^bb1
          spv.mlir.merge
        }
        %128 = spv.Load "Function" %127 : vector<4xf32>
        %129 = spv.Load "Function" %126 : vector<4xf32>
        %130 = spv.Load "Function" %125 : vector<4xf32>
        %131 = spv.Load "Function" %124 : vector<4xf32>
        spv.Store "Function" %40, %131 : vector<4xf32>
        spv.Store "Function" %41, %130 : vector<4xf32>
        spv.Store "Function" %42, %129 : vector<4xf32>
        spv.Store "Function" %43, %128 : vector<4xf32>
        %132 = spv.IAdd %118, %5 : i32
        spv.Branch ^bb1(%132, %131, %130, %129, %128 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      %44 = spv.Load "Function" %43 : vector<4xf32>
      %45 = spv.Load "Function" %42 : vector<4xf32>
      %46 = spv.Load "Function" %41 : vector<4xf32>
      %47 = spv.Load "Function" %40 : vector<4xf32>
      %48 = spv.IAdd %36, %6 : i32
      %49 = spv.IAdd %22, %48 : i32
      %50 = spv.IAdd %23, %32 : i32
      %51 = spv.IAdd %24, %37 : i32
      %52 = spv.SDiv %51, %1 : i32
      %53 = spv.Constant 0 : i32
      %54 = spv.Constant 0 : i32
      %55 = spv.Constant 100352 : i32
      %56 = spv.IMul %55, %7 : i32
      %57 = spv.IAdd %54, %56 : i32
      %58 = spv.Constant 896 : i32
      %59 = spv.IMul %58, %49 : i32
      %60 = spv.IAdd %57, %59 : i32
      %61 = spv.Constant 8 : i32
      %62 = spv.IMul %61, %50 : i32
      %63 = spv.IAdd %60, %62 : i32
      %64 = spv.Constant 1 : i32
      %65 = spv.IMul %64, %52 : i32
      %66 = spv.IAdd %63, %65 : i32
      %67 = spv.AccessChain %12[%53, %66] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %67, %44 : vector<4xf32>
      %68 = spv.IAdd %36, %2 : i32
      %69 = spv.IAdd %22, %68 : i32
      %70 = spv.Constant 0 : i32
      %71 = spv.Constant 0 : i32
      %72 = spv.Constant 100352 : i32
      %73 = spv.IMul %72, %7 : i32
      %74 = spv.IAdd %71, %73 : i32
      %75 = spv.Constant 896 : i32
      %76 = spv.IMul %75, %69 : i32
      %77 = spv.IAdd %74, %76 : i32
      %78 = spv.Constant 8 : i32
      %79 = spv.IMul %78, %50 : i32
      %80 = spv.IAdd %77, %79 : i32
      %81 = spv.Constant 1 : i32
      %82 = spv.IMul %81, %52 : i32
      %83 = spv.IAdd %80, %82 : i32
      %84 = spv.AccessChain %12[%70, %83] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %84, %45 : vector<4xf32>
      %85 = spv.IAdd %36, %5 : i32
      %86 = spv.IAdd %22, %85 : i32
      %87 = spv.Constant 0 : i32
      %88 = spv.Constant 0 : i32
      %89 = spv.Constant 100352 : i32
      %90 = spv.IMul %89, %7 : i32
      %91 = spv.IAdd %88, %90 : i32
      %92 = spv.Constant 896 : i32
      %93 = spv.IMul %92, %86 : i32
      %94 = spv.IAdd %91, %93 : i32
      %95 = spv.Constant 8 : i32
      %96 = spv.IMul %95, %50 : i32
      %97 = spv.IAdd %94, %96 : i32
      %98 = spv.Constant 1 : i32
      %99 = spv.IMul %98, %52 : i32
      %100 = spv.IAdd %97, %99 : i32
      %101 = spv.AccessChain %12[%87, %100] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %101, %46 : vector<4xf32>
      %102 = spv.IAdd %22, %36 : i32
      %103 = spv.Constant 0 : i32
      %104 = spv.Constant 0 : i32
      %105 = spv.Constant 100352 : i32
      %106 = spv.IMul %105, %7 : i32
      %107 = spv.IAdd %104, %106 : i32
      %108 = spv.Constant 896 : i32
      %109 = spv.IMul %108, %102 : i32
      %110 = spv.IAdd %107, %109 : i32
      %111 = spv.Constant 8 : i32
      %112 = spv.IMul %111, %50 : i32
      %113 = spv.IAdd %110, %112 : i32
      %114 = spv.Constant 1 : i32
      %115 = spv.IMul %114, %52 : i32
      %116 = spv.IAdd %113, %115 : i32
      %117 = spv.AccessChain %12[%103, %116] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
      spv.Store "StorageBuffer" %117, %47 : vector<4xf32>
      spv.Return
    }
  }
  // Private interface symbol declaring the three resource bindings listed
  // below. Every binding is in set 0 and has type "StorageBuffer".
  // NOTE(review): the (set, binding) pairs here are numerically the same as the
  // bind(0, 0) / bind(0, 1) / bind(0, 2) decorations on the spv.GlobalVariable
  // ops in the SPIRVLowerABIAttributes dump that follows; presumably they
  // describe the same resources -- confirm against the lowering pass.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    // Bindings 0 and 1 are declared with "Read" access only.
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    // Binding 2 is declared with "Write|Discard" access.
    // NOTE(review): "Discard" presumably means prior buffer contents need not
    // be preserved -- confirm against the HAL dialect documentation.
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After SPIRVLowerABIAttributes ***
spv.module Logical GLSL450 {
  spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
  spv.func @conv_dispatch_0() "None" {
    %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
    %1 = spv.Constant 4 : i32
    %2 = spv.Constant 2 : i32
    %3 = spv.Constant 6 : i32
    %4 = spv.Constant 0.000000e+00 : f32
    %5 = spv.Constant 1 : i32
    %6 = spv.Constant 3 : i32
    %7 = spv.Constant 0 : i32
    %8 = spv.Constant 16 : i32
    %9 = spv.Constant 8 : i32
    %10 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
    %11 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %12 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %13 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
    %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %17 = spv.Load "Input" %16 : vector<3xi32>
    %18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
    %19 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %20 = spv.Load "Input" %19 : vector<3xi32>
    %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
    %22 = spv.IMul %21, %1 : i32
    %23 = spv.IMul %18, %1 : i32
    %24 = spv.IMul %15, %8 : i32
    %25 = spv.IMul %21, %9 : i32
    %26 = spv.IMul %18, %9 : i32
    %27 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %28 = spv.Load "Input" %27 : vector<3xi32>
    %29 = spv.CompositeExtract %28[0 : i32] : vector<3xi32>
    %30 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %31 = spv.Load "Input" %30 : vector<3xi32>
    %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
    %33 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %34 = spv.Load "Input" %33 : vector<3xi32>
    %35 = spv.CompositeExtract %34[2 : i32] : vector<3xi32>
    %36 = spv.IMul %35, %1 : i32
    %37 = spv.IMul %29, %1 : i32
    %38 = spv.IMul %35, %9 : i32
    %39 = spv.IMul %32, %2 : i32
    %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %43 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    spv.mlir.loop {
      spv.Branch ^bb1(%7, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
    ^bb1(%118: i32, %119: vector<4xf32>, %120: vector<4xf32>, %121: vector<4xf32>, %122: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
      %123 = spv.SLessThan %118, %6 : i32
      spv.BranchConditional %123, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %124 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %125 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %126 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %127 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      spv.mlir.loop {
        spv.Branch ^bb1(%7, %119, %120, %121, %122 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%133: i32, %134: vector<4xf32>, %135: vector<4xf32>, %136: vector<4xf32>, %137: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %138 = spv.SLessThan %133, %6 : i32
        spv.BranchConditional %138, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %139 = spv.IAdd %24, %37 : i32
        %140 = spv.SDiv %139, %1 : i32
        %141 = spv.Constant 0 : i32
        %142 = spv.Constant 0 : i32
        %143 = spv.Constant 72 : i32
        %144 = spv.IMul %143, %118 : i32
        %145 = spv.IAdd %142, %144 : i32
        %146 = spv.Constant 24 : i32
        %147 = spv.IMul %146, %133 : i32
        %148 = spv.IAdd %145, %147 : i32
        %149 = spv.Constant 8 : i32
        %150 = spv.IMul %149, %7 : i32
        %151 = spv.IAdd %148, %150 : i32
        %152 = spv.Constant 1 : i32
        %153 = spv.IMul %152, %140 : i32
        %154 = spv.IAdd %151, %153 : i32
        %155 = spv.AccessChain %11[%141, %154] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %156 = spv.Load "StorageBuffer" %155 : vector<4xf32>
        %157 = spv.Constant 0 : i32
        %158 = spv.Constant 0 : i32
        %159 = spv.Constant 72 : i32
        %160 = spv.IMul %159, %118 : i32
        %161 = spv.IAdd %158, %160 : i32
        %162 = spv.Constant 24 : i32
        %163 = spv.IMul %162, %133 : i32
        %164 = spv.IAdd %161, %163 : i32
        %165 = spv.Constant 8 : i32
        %166 = spv.IMul %165, %5 : i32
        %167 = spv.IAdd %164, %166 : i32
        %168 = spv.Constant 1 : i32
        %169 = spv.IMul %168, %140 : i32
        %170 = spv.IAdd %167, %169 : i32
        %171 = spv.AccessChain %11[%157, %170] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %172 = spv.Load "StorageBuffer" %171 : vector<4xf32>
        %173 = spv.Constant 0 : i32
        %174 = spv.Constant 0 : i32
        %175 = spv.Constant 72 : i32
        %176 = spv.IMul %175, %118 : i32
        %177 = spv.IAdd %174, %176 : i32
        %178 = spv.Constant 24 : i32
        %179 = spv.IMul %178, %133 : i32
        %180 = spv.IAdd %177, %179 : i32
        %181 = spv.Constant 8 : i32
        %182 = spv.IMul %181, %2 : i32
        %183 = spv.IAdd %180, %182 : i32
        %184 = spv.Constant 1 : i32
        %185 = spv.IMul %184, %140 : i32
        %186 = spv.IAdd %183, %185 : i32
        %187 = spv.AccessChain %11[%173, %186] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %188 = spv.Load "StorageBuffer" %187 : vector<4xf32>
        %189 = spv.IAdd %38, %118 : i32
        %190 = spv.IAdd %39, %133 : i32
        %191 = spv.IAdd %25, %189 : i32
        %192 = spv.IAdd %26, %190 : i32
        %193 = spv.Constant 0 : i32
        %194 = spv.Constant 0 : i32
        %195 = spv.Constant 0 : i32
        %196 = spv.Constant 151875 : i32
        %197 = spv.IMul %196, %7 : i32
        %198 = spv.IAdd %195, %197 : i32
        %199 = spv.Constant 675 : i32
        %200 = spv.IMul %199, %191 : i32
        %201 = spv.IAdd %198, %200 : i32
        %202 = spv.Constant 3 : i32
        %203 = spv.IMul %202, %192 : i32
        %204 = spv.IAdd %201, %203 : i32
        %205 = spv.Constant 1 : i32
        %206 = spv.IMul %205, %193 : i32
        %207 = spv.IAdd %204, %206 : i32
        %208 = spv.AccessChain %10[%194, %207] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %209 = spv.Load "StorageBuffer" %208 : f32
        %210 = spv.Constant 1 : i32
        %211 = spv.Constant 0 : i32
        %212 = spv.Constant 0 : i32
        %213 = spv.Constant 151875 : i32
        %214 = spv.IMul %213, %7 : i32
        %215 = spv.IAdd %212, %214 : i32
        %216 = spv.Constant 675 : i32
        %217 = spv.IMul %216, %191 : i32
        %218 = spv.IAdd %215, %217 : i32
        %219 = spv.Constant 3 : i32
        %220 = spv.IMul %219, %192 : i32
        %221 = spv.IAdd %218, %220 : i32
        %222 = spv.Constant 1 : i32
        %223 = spv.IMul %222, %210 : i32
        %224 = spv.IAdd %221, %223 : i32
        %225 = spv.AccessChain %10[%211, %224] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %226 = spv.Load "StorageBuffer" %225 : f32
        %227 = spv.Constant 2 : i32
        %228 = spv.Constant 0 : i32
        %229 = spv.Constant 0 : i32
        %230 = spv.Constant 151875 : i32
        %231 = spv.IMul %230, %7 : i32
        %232 = spv.IAdd %229, %231 : i32
        %233 = spv.Constant 675 : i32
        %234 = spv.IMul %233, %191 : i32
        %235 = spv.IAdd %232, %234 : i32
        %236 = spv.Constant 3 : i32
        %237 = spv.IMul %236, %192 : i32
        %238 = spv.IAdd %235, %237 : i32
        %239 = spv.Constant 1 : i32
        %240 = spv.IMul %239, %227 : i32
        %241 = spv.IAdd %238, %240 : i32
        %242 = spv.AccessChain %10[%228, %241] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %243 = spv.Load "StorageBuffer" %242 : f32
        %244 = spv.CompositeConstruct %209, %226, %243 : vector<3xf32>
        %245 = spv.CompositeExtract %244[0 : i32] : vector<3xf32>
        %246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
        %247 = spv.FMul %246, %156 : vector<4xf32>
        %248 = spv.FAdd %247, %134 : vector<4xf32>
        %249 = spv.CompositeExtract %244[1 : i32] : vector<3xf32>
        %250 = spv.CompositeConstruct %249, %249, %249, %249 : vector<4xf32>
        %251 = spv.FMul %250, %172 : vector<4xf32>
        %252 = spv.FAdd %251, %248 : vector<4xf32>
        %253 = spv.CompositeExtract %244[2 : i32] : vector<3xf32>
        %254 = spv.CompositeConstruct %253, %253, %253, %253 : vector<4xf32>
        %255 = spv.FMul %254, %188 : vector<4xf32>
        %256 = spv.FAdd %255, %252 : vector<4xf32>
        %257 = spv.IAdd %118, %2 : i32
        %258 = spv.IAdd %38, %257 : i32
        %259 = spv.IAdd %25, %258 : i32
        %260 = spv.Constant 0 : i32
        %261 = spv.Constant 0 : i32
        %262 = spv.Constant 0 : i32
        %263 = spv.Constant 151875 : i32
        %264 = spv.IMul %263, %7 : i32
        %265 = spv.IAdd %262, %264 : i32
        %266 = spv.Constant 675 : i32
        %267 = spv.IMul %266, %259 : i32
        %268 = spv.IAdd %265, %267 : i32
        %269 = spv.Constant 3 : i32
        %270 = spv.IMul %269, %192 : i32
        %271 = spv.IAdd %268, %270 : i32
        %272 = spv.Constant 1 : i32
        %273 = spv.IMul %272, %260 : i32
        %274 = spv.IAdd %271, %273 : i32
        %275 = spv.AccessChain %10[%261, %274] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %276 = spv.Load "StorageBuffer" %275 : f32
        %277 = spv.Constant 1 : i32
        %278 = spv.Constant 0 : i32
        %279 = spv.Constant 0 : i32
        %280 = spv.Constant 151875 : i32
        %281 = spv.IMul %280, %7 : i32
        %282 = spv.IAdd %279, %281 : i32
        %283 = spv.Constant 675 : i32
        %284 = spv.IMul %283, %259 : i32
        %285 = spv.IAdd %282, %284 : i32
        %286 = spv.Constant 3 : i32
        %287 = spv.IMul %286, %192 : i32
        %288 = spv.IAdd %285, %287 : i32
        %289 = spv.Constant 1 : i32
        %290 = spv.IMul %289, %277 : i32
        %291 = spv.IAdd %288, %290 : i32
        %292 = spv.AccessChain %10[%278, %291] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %293 = spv.Load "StorageBuffer" %292 : f32
        %294 = spv.Constant 2 : i32
        %295 = spv.Constant 0 : i32
        %296 = spv.Constant 0 : i32
        %297 = spv.Constant 151875 : i32
        %298 = spv.IMul %297, %7 : i32
        %299 = spv.IAdd %296, %298 : i32
        %300 = spv.Constant 675 : i32
        %301 = spv.IMul %300, %259 : i32
        %302 = spv.IAdd %299, %301 : i32
        %303 = spv.Constant 3 : i32
        %304 = spv.IMul %303, %192 : i32
        %305 = spv.IAdd %302, %304 : i32
        %306 = spv.Constant 1 : i32
        %307 = spv.IMul %306, %294 : i32
        %308 = spv.IAdd %305, %307 : i32
        %309 = spv.AccessChain %10[%295, %308] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %310 = spv.Load "StorageBuffer" %309 : f32
        %311 = spv.CompositeConstruct %276, %293, %310 : vector<3xf32>
        %312 = spv.CompositeExtract %311[0 : i32] : vector<3xf32>
        %313 = spv.CompositeConstruct %312, %312, %312, %312 : vector<4xf32>
        %314 = spv.FMul %313, %156 : vector<4xf32>
        %315 = spv.FAdd %314, %135 : vector<4xf32>
        %316 = spv.CompositeExtract %311[1 : i32] : vector<3xf32>
        %317 = spv.CompositeConstruct %316, %316, %316, %316 : vector<4xf32>
        %318 = spv.FMul %317, %172 : vector<4xf32>
        %319 = spv.FAdd %318, %315 : vector<4xf32>
        %320 = spv.CompositeExtract %311[2 : i32] : vector<3xf32>
        %321 = spv.CompositeConstruct %320, %320, %320, %320 : vector<4xf32>
        %322 = spv.FMul %321, %188 : vector<4xf32>
        %323 = spv.FAdd %322, %319 : vector<4xf32>
        %324 = spv.IAdd %118, %1 : i32
        %325 = spv.IAdd %38, %324 : i32
        %326 = spv.IAdd %25, %325 : i32
        %327 = spv.Constant 0 : i32
        %328 = spv.Constant 0 : i32
        %329 = spv.Constant 0 : i32
        %330 = spv.Constant 151875 : i32
        %331 = spv.IMul %330, %7 : i32
        %332 = spv.IAdd %329, %331 : i32
        %333 = spv.Constant 675 : i32
        %334 = spv.IMul %333, %326 : i32
        %335 = spv.IAdd %332, %334 : i32
        %336 = spv.Constant 3 : i32
        %337 = spv.IMul %336, %192 : i32
        %338 = spv.IAdd %335, %337 : i32
        %339 = spv.Constant 1 : i32
        %340 = spv.IMul %339, %327 : i32
        %341 = spv.IAdd %338, %340 : i32
        %342 = spv.AccessChain %10[%328, %341] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %343 = spv.Load "StorageBuffer" %342 : f32
        %344 = spv.Constant 1 : i32
        %345 = spv.Constant 0 : i32
        %346 = spv.Constant 0 : i32
        %347 = spv.Constant 151875 : i32
        %348 = spv.IMul %347, %7 : i32
        %349 = spv.IAdd %346, %348 : i32
        %350 = spv.Constant 675 : i32
        %351 = spv.IMul %350, %326 : i32
        %352 = spv.IAdd %349, %351 : i32
        %353 = spv.Constant 3 : i32
        %354 = spv.IMul %353, %192 : i32
        %355 = spv.IAdd %352, %354 : i32
        %356 = spv.Constant 1 : i32
        %357 = spv.IMul %356, %344 : i32
        %358 = spv.IAdd %355, %357 : i32
        %359 = spv.AccessChain %10[%345, %358] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %360 = spv.Load "StorageBuffer" %359 : f32
        %361 = spv.Constant 2 : i32
        %362 = spv.Constant 0 : i32
        %363 = spv.Constant 0 : i32
        %364 = spv.Constant 151875 : i32
        %365 = spv.IMul %364, %7 : i32
        %366 = spv.IAdd %363, %365 : i32
        %367 = spv.Constant 675 : i32
        %368 = spv.IMul %367, %326 : i32
        %369 = spv.IAdd %366, %368 : i32
        %370 = spv.Constant 3 : i32
        %371 = spv.IMul %370, %192 : i32
        %372 = spv.IAdd %369, %371 : i32
        %373 = spv.Constant 1 : i32
        %374 = spv.IMul %373, %361 : i32
        %375 = spv.IAdd %372, %374 : i32
        %376 = spv.AccessChain %10[%362, %375] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %377 = spv.Load "StorageBuffer" %376 : f32
        %378 = spv.CompositeConstruct %343, %360, %377 : vector<3xf32>
        %379 = spv.CompositeExtract %378[0 : i32] : vector<3xf32>
        %380 = spv.CompositeConstruct %379, %379, %379, %379 : vector<4xf32>
        %381 = spv.FMul %380, %156 : vector<4xf32>
        %382 = spv.FAdd %381, %136 : vector<4xf32>
        %383 = spv.CompositeExtract %378[1 : i32] : vector<3xf32>
        %384 = spv.CompositeConstruct %383, %383, %383, %383 : vector<4xf32>
        %385 = spv.FMul %384, %172 : vector<4xf32>
        %386 = spv.FAdd %385, %382 : vector<4xf32>
        %387 = spv.CompositeExtract %378[2 : i32] : vector<3xf32>
        %388 = spv.CompositeConstruct %387, %387, %387, %387 : vector<4xf32>
        %389 = spv.FMul %388, %188 : vector<4xf32>
        %390 = spv.FAdd %389, %386 : vector<4xf32>
        %391 = spv.IAdd %118, %3 : i32
        %392 = spv.IAdd %38, %391 : i32
        %393 = spv.IAdd %25, %392 : i32
        %394 = spv.Constant 0 : i32
        %395 = spv.Constant 0 : i32
        %396 = spv.Constant 0 : i32
        %397 = spv.Constant 151875 : i32
        %398 = spv.IMul %397, %7 : i32
        %399 = spv.IAdd %396, %398 : i32
        %400 = spv.Constant 675 : i32
        %401 = spv.IMul %400, %393 : i32
        %402 = spv.IAdd %399, %401 : i32
        %403 = spv.Constant 3 : i32
        %404 = spv.IMul %403, %192 : i32
        %405 = spv.IAdd %402, %404 : i32
        %406 = spv.Constant 1 : i32
        %407 = spv.IMul %406, %394 : i32
        %408 = spv.IAdd %405, %407 : i32
        %409 = spv.AccessChain %10[%395, %408] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %410 = spv.Load "StorageBuffer" %409 : f32
        %411 = spv.Constant 1 : i32
        %412 = spv.Constant 0 : i32
        %413 = spv.Constant 0 : i32
        %414 = spv.Constant 151875 : i32
        %415 = spv.IMul %414, %7 : i32
        %416 = spv.IAdd %413, %415 : i32
        %417 = spv.Constant 675 : i32
        %418 = spv.IMul %417, %393 : i32
        %419 = spv.IAdd %416, %418 : i32
        %420 = spv.Constant 3 : i32
        %421 = spv.IMul %420, %192 : i32
        %422 = spv.IAdd %419, %421 : i32
        %423 = spv.Constant 1 : i32
        %424 = spv.IMul %423, %411 : i32
        %425 = spv.IAdd %422, %424 : i32
        %426 = spv.AccessChain %10[%412, %425] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %427 = spv.Load "StorageBuffer" %426 : f32
        %428 = spv.Constant 2 : i32
        %429 = spv.Constant 0 : i32
        %430 = spv.Constant 0 : i32
        %431 = spv.Constant 151875 : i32
        %432 = spv.IMul %431, %7 : i32
        %433 = spv.IAdd %430, %432 : i32
        %434 = spv.Constant 675 : i32
        %435 = spv.IMul %434, %393 : i32
        %436 = spv.IAdd %433, %435 : i32
        %437 = spv.Constant 3 : i32
        %438 = spv.IMul %437, %192 : i32
        %439 = spv.IAdd %436, %438 : i32
        %440 = spv.Constant 1 : i32
        %441 = spv.IMul %440, %428 : i32
        %442 = spv.IAdd %439, %441 : i32
        %443 = spv.AccessChain %10[%429, %442] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %444 = spv.Load "StorageBuffer" %443 : f32
        %445 = spv.CompositeConstruct %410, %427, %444 : vector<3xf32>
        %446 = spv.CompositeExtract %445[0 : i32] : vector<3xf32>
        %447 = spv.CompositeConstruct %446, %446, %446, %446 : vector<4xf32>
        %448 = spv.FMul %447, %156 : vector<4xf32>
        %449 = spv.FAdd %448, %137 : vector<4xf32>
        %450 = spv.CompositeExtract %445[1 : i32] : vector<3xf32>
        %451 = spv.CompositeConstruct %450, %450, %450, %450 : vector<4xf32>
        %452 = spv.FMul %451, %172 : vector<4xf32>
        %453 = spv.FAdd %452, %449 : vector<4xf32>
        %454 = spv.CompositeExtract %445[2 : i32] : vector<3xf32>
        %455 = spv.CompositeConstruct %454, %454, %454, %454 : vector<4xf32>
        %456 = spv.FMul %455, %188 : vector<4xf32>
        %457 = spv.FAdd %456, %453 : vector<4xf32>
        spv.Store "Function" %124, %256 : vector<4xf32>
        spv.Store "Function" %125, %323 : vector<4xf32>
        spv.Store "Function" %126, %390 : vector<4xf32>
        spv.Store "Function" %127, %457 : vector<4xf32>
        %458 = spv.IAdd %133, %5 : i32
        spv.Branch ^bb1(%458, %256, %323, %390, %457 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      %128 = spv.Load "Function" %127 : vector<4xf32>
      %129 = spv.Load "Function" %126 : vector<4xf32>
      %130 = spv.Load "Function" %125 : vector<4xf32>
      %131 = spv.Load "Function" %124 : vector<4xf32>
      spv.Store "Function" %40, %131 : vector<4xf32>
      spv.Store "Function" %41, %130 : vector<4xf32>
      spv.Store "Function" %42, %129 : vector<4xf32>
      spv.Store "Function" %43, %128 : vector<4xf32>
      %132 = spv.IAdd %118, %5 : i32
      spv.Branch ^bb1(%132, %131, %130, %129, %128 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
    ^bb3:  // pred: ^bb1
      spv.mlir.merge
    }
    %44 = spv.Load "Function" %43 : vector<4xf32>
    %45 = spv.Load "Function" %42 : vector<4xf32>
    %46 = spv.Load "Function" %41 : vector<4xf32>
    %47 = spv.Load "Function" %40 : vector<4xf32>
    %48 = spv.IAdd %36, %6 : i32
    %49 = spv.IAdd %22, %48 : i32
    %50 = spv.IAdd %23, %32 : i32
    %51 = spv.IAdd %24, %37 : i32
    %52 = spv.SDiv %51, %1 : i32
    %53 = spv.Constant 0 : i32
    %54 = spv.Constant 0 : i32
    %55 = spv.Constant 100352 : i32
    %56 = spv.IMul %55, %7 : i32
    %57 = spv.IAdd %54, %56 : i32
    %58 = spv.Constant 896 : i32
    %59 = spv.IMul %58, %49 : i32
    %60 = spv.IAdd %57, %59 : i32
    %61 = spv.Constant 8 : i32
    %62 = spv.IMul %61, %50 : i32
    %63 = spv.IAdd %60, %62 : i32
    %64 = spv.Constant 1 : i32
    %65 = spv.IMul %64, %52 : i32
    %66 = spv.IAdd %63, %65 : i32
    %67 = spv.AccessChain %12[%53, %66] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %67, %44 : vector<4xf32>
    %68 = spv.IAdd %36, %2 : i32
    %69 = spv.IAdd %22, %68 : i32
    %70 = spv.Constant 0 : i32
    %71 = spv.Constant 0 : i32
    %72 = spv.Constant 100352 : i32
    %73 = spv.IMul %72, %7 : i32
    %74 = spv.IAdd %71, %73 : i32
    %75 = spv.Constant 896 : i32
    %76 = spv.IMul %75, %69 : i32
    %77 = spv.IAdd %74, %76 : i32
    %78 = spv.Constant 8 : i32
    %79 = spv.IMul %78, %50 : i32
    %80 = spv.IAdd %77, %79 : i32
    %81 = spv.Constant 1 : i32
    %82 = spv.IMul %81, %52 : i32
    %83 = spv.IAdd %80, %82 : i32
    %84 = spv.AccessChain %12[%70, %83] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %84, %45 : vector<4xf32>
    %85 = spv.IAdd %36, %5 : i32
    %86 = spv.IAdd %22, %85 : i32
    %87 = spv.Constant 0 : i32
    %88 = spv.Constant 0 : i32
    %89 = spv.Constant 100352 : i32
    %90 = spv.IMul %89, %7 : i32
    %91 = spv.IAdd %88, %90 : i32
    %92 = spv.Constant 896 : i32
    %93 = spv.IMul %92, %86 : i32
    %94 = spv.IAdd %91, %93 : i32
    %95 = spv.Constant 8 : i32
    %96 = spv.IMul %95, %50 : i32
    %97 = spv.IAdd %94, %96 : i32
    %98 = spv.Constant 1 : i32
    %99 = spv.IMul %98, %52 : i32
    %100 = spv.IAdd %97, %99 : i32
    %101 = spv.AccessChain %12[%87, %100] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %101, %46 : vector<4xf32>
    %102 = spv.IAdd %22, %36 : i32
    %103 = spv.Constant 0 : i32
    %104 = spv.Constant 0 : i32
    %105 = spv.Constant 100352 : i32
    %106 = spv.IMul %105, %7 : i32
    %107 = spv.IAdd %104, %106 : i32
    %108 = spv.Constant 896 : i32
    %109 = spv.IMul %108, %102 : i32
    %110 = spv.IAdd %107, %109 : i32
    %111 = spv.Constant 8 : i32
    %112 = spv.IMul %111, %50 : i32
    %113 = spv.IAdd %110, %112 : i32
    %114 = spv.Constant 1 : i32
    %115 = spv.IMul %114, %52 : i32
    %116 = spv.IAdd %113, %115 : i32
    %117 = spv.AccessChain %12[%103, %116] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %117, %47 : vector<4xf32>
    spv.Return
  }
  spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}

// *** IR Dump After Canonicalizer ***
// SPIR-V module for the compute kernel @conv_dispatch_0, as printed after the
// Canonicalizer pass. Each invocation accumulates four vector<4xf32> results
// inside a 3x3 loop nest and writes them to the binding-2 buffer.
// NOTE(review): the buffer element counts are consistent with the shapes in
// the @conv signature at the top of this file (151875 = 1*225*225*3 f32,
// 216 = 3*3*3*32/4 vec4, 100352 = 1*112*112*32/4 vec4) -- the roles
// input/filter/output are inferred from that match and from how each buffer
// is accessed below.
spv.module Logical GLSL450 {
  // Built-in inputs read by the kernel.
  spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Binding 2: only ever stored to (the four spv.Store ops before spv.Return).
  spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
  // Binding 1: loaded as vector<4xf32> inside the inner loop.
  spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
  // Binding 0: loaded one f32 at a time inside the inner loop.
  spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
  spv.func @conv_dispatch_0() "None" {
    // Constants hoisted to the top of the function by canonicalization.
    // %0 is the zero vector used to initialize the four accumulators.
    %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
    %1 = spv.Constant 4 : i32
    %2 = spv.Constant 2 : i32
    %3 = spv.Constant 6 : i32
    %4 = spv.Constant 1 : i32
    %5 = spv.Constant 3 : i32
    %6 = spv.Constant 0 : i32
    %7 = spv.Constant 16 : i32
    %8 = spv.Constant 8 : i32
    // Linearization strides used below: 72 and 24 index binding 1,
    // 675 indexes binding 0, 896 indexes binding 2.
    // NOTE(review): 72 = 3*3*8, 24 = 3*8, 675 = 225*3, 896 = 112*8, which
    // matches row-major strides for the tensor shapes in @conv -- confirm.
    %9 = spv.Constant 72 : i32
    %10 = spv.Constant 24 : i32
    %11 = spv.Constant 675 : i32
    %12 = spv.Constant 896 : i32
    %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
    %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    // WorkgroupId components: %18 = x, %21 = y, %24 = z. The built-in's
    // address is taken and loaded separately for each component here.
    %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %17 = spv.Load "Input" %16 : vector<3xi32>
    %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
    %19 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %20 = spv.Load "Input" %19 : vector<3xi32>
    %21 = spv.CompositeExtract %20[1 : i32] : vector<3xi32>
    %22 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %23 = spv.Load "Input" %22 : vector<3xi32>
    %24 = spv.CompositeExtract %23[2 : i32] : vector<3xi32>
    // Per-workgroup base offsets:
    //   %25 = wg.z * 4, %26 = wg.y * 4, %27 = wg.x * 16  (used for the stores)
    //   %28 = wg.z * 8, %29 = wg.y * 8                   (used for binding-0 loads)
    %25 = spv.IMul %24, %1 : i32
    %26 = spv.IMul %21, %1 : i32
    %27 = spv.IMul %18, %7 : i32
    %28 = spv.IMul %24, %8 : i32
    %29 = spv.IMul %21, %8 : i32
    // LocalInvocationId components: %32 = x, %35 = y, %38 = z.
    %30 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %31 = spv.Load "Input" %30 : vector<3xi32>
    %32 = spv.CompositeExtract %31[0 : i32] : vector<3xi32>
    %33 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %34 = spv.Load "Input" %33 : vector<3xi32>
    %35 = spv.CompositeExtract %34[1 : i32] : vector<3xi32>
    %36 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %37 = spv.Load "Input" %36 : vector<3xi32>
    %38 = spv.CompositeExtract %37[2 : i32] : vector<3xi32>
    // Per-invocation offsets:
    //   %39 = local.z * 4, %40 = local.x * 4  (used for the stores / binding 1)
    //   %41 = local.z * 8, %42 = local.y * 2  (used for binding-0 loads)
    %39 = spv.IMul %38, %1 : i32
    %40 = spv.IMul %32, %1 : i32
    %41 = spv.IMul %38, %8 : i32
    %42 = spv.IMul %35, %2 : i32
    // Function-storage slots that carry the four accumulators out of the
    // outer loop region (%43 <- first accumulator ... %46 <- fourth).
    %43 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %44 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %45 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %46 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    // Outer loop: counter %81 runs 0, 1, 2 (compared against %5 = 3, stepped
    // by %4 = 1); %82..%85 are the loop-carried accumulators, starting at zero.
    spv.mlir.loop {
      spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
    ^bb1(%81: i32, %82: vector<4xf32>, %83: vector<4xf32>, %84: vector<4xf32>, %85: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
      %86 = spv.SLessThan %81, %5 : i32
      spv.BranchConditional %86, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Slots that carry the accumulators out of the inner loop region.
      %87 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %88 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %89 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %90 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      // Inner loop: counter %96 runs 0, 1, 2; %97..%100 continue the
      // accumulators handed in from the outer loop (%82..%85).
      spv.mlir.loop {
        spv.Branch ^bb1(%6, %82, %83, %84, %85 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%96: i32, %97: vector<4xf32>, %98: vector<4xf32>, %99: vector<4xf32>, %100: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %101 = spv.SLessThan %96, %5 : i32
        spv.BranchConditional %101, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // %103 = (wg.x * 16 + local.x * 4) / 4: the vec4 column this
        // invocation reads from binding 1.
        %102 = spv.IAdd %27, %40 : i32
        %103 = spv.SDiv %102, %1 : i32
        // Three vec4 loads from binding 1 at
        //   %81 * 72 + %96 * 24 + {0, 8, 16} + %103
        // giving %109, %116 and %123. The same index sub-expressions are
        // recomputed for each load in this dump.
        %104 = spv.IMul %81, %9 : i32
        %105 = spv.IMul %96, %10 : i32
        %106 = spv.IAdd %104, %105 : i32
        %107 = spv.IAdd %106, %103 : i32
        %108 = spv.AccessChain %14[%6, %107] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %109 = spv.Load "StorageBuffer" %108 : vector<4xf32>
        %110 = spv.IMul %81, %9 : i32
        %111 = spv.IMul %96, %10 : i32
        %112 = spv.IAdd %110, %111 : i32
        %113 = spv.IAdd %112, %8 : i32
        %114 = spv.IAdd %113, %103 : i32
        %115 = spv.AccessChain %14[%6, %114] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %116 = spv.Load "StorageBuffer" %115 : vector<4xf32>
        %117 = spv.IMul %81, %9 : i32
        %118 = spv.IMul %96, %10 : i32
        %119 = spv.IAdd %117, %118 : i32
        %120 = spv.IAdd %119, %7 : i32
        %121 = spv.IAdd %120, %103 : i32
        %122 = spv.AccessChain %14[%6, %121] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %123 = spv.Load "StorageBuffer" %122 : vector<4xf32>
        // Binding-0 coordinates for the first accumulator:
        //   %126 = wg.z * 8 + local.z * 8 + %81
        //   %127 = wg.y * 8 + local.y * 2 + %96   (shared by all four groups)
        %124 = spv.IAdd %41, %81 : i32
        %125 = spv.IAdd %42, %96 : i32
        %126 = spv.IAdd %28, %124 : i32
        %127 = spv.IAdd %29, %125 : i32
        // Three consecutive f32 loads at %126 * 675 + %127 * 3 + {0, 1, 2}.
        %128 = spv.IMul %126, %11 : i32
        %129 = spv.IMul %127, %5 : i32
        %130 = spv.IAdd %128, %129 : i32
        %131 = spv.AccessChain %13[%6, %130] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %132 = spv.Load "StorageBuffer" %131 : f32
        %133 = spv.IMul %126, %11 : i32
        %134 = spv.IMul %127, %5 : i32
        %135 = spv.IAdd %133, %134 : i32
        %136 = spv.IAdd %135, %4 : i32
        %137 = spv.AccessChain %13[%6, %136] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %138 = spv.Load "StorageBuffer" %137 : f32
        %139 = spv.IMul %126, %11 : i32
        %140 = spv.IMul %127, %5 : i32
        %141 = spv.IAdd %139, %140 : i32
        %142 = spv.IAdd %141, %2 : i32
        %143 = spv.AccessChain %13[%6, %142] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %144 = spv.Load "StorageBuffer" %143 : f32
        // Pack the three scalars, then for each component: splat it to a
        // vec4, multiply by the matching binding-1 vector (%109/%116/%123)
        // and add into the running sum. %157 is the updated first accumulator.
        %145 = spv.CompositeConstruct %132, %138, %144 : vector<3xf32>
        %146 = spv.CompositeExtract %145[0 : i32] : vector<3xf32>
        %147 = spv.CompositeConstruct %146, %146, %146, %146 : vector<4xf32>
        %148 = spv.FMul %147, %109 : vector<4xf32>
        %149 = spv.FAdd %148, %97 : vector<4xf32>
        %150 = spv.CompositeExtract %145[1 : i32] : vector<3xf32>
        %151 = spv.CompositeConstruct %150, %150, %150, %150 : vector<4xf32>
        %152 = spv.FMul %151, %116 : vector<4xf32>
        %153 = spv.FAdd %152, %149 : vector<4xf32>
        %154 = spv.CompositeExtract %145[2 : i32] : vector<3xf32>
        %155 = spv.CompositeConstruct %154, %154, %154, %154 : vector<4xf32>
        %156 = spv.FMul %155, %123 : vector<4xf32>
        %157 = spv.FAdd %156, %153 : vector<4xf32>
        // Second accumulator: same pattern with the binding-0 row coordinate
        // offset by +2 (%160 = wg.z * 8 + local.z * 8 + %81 + 2); result %190.
        %158 = spv.IAdd %81, %2 : i32
        %159 = spv.IAdd %41, %158 : i32
        %160 = spv.IAdd %28, %159 : i32
        %161 = spv.IMul %160, %11 : i32
        %162 = spv.IMul %127, %5 : i32
        %163 = spv.IAdd %161, %162 : i32
        %164 = spv.AccessChain %13[%6, %163] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %165 = spv.Load "StorageBuffer" %164 : f32
        %166 = spv.IMul %160, %11 : i32
        %167 = spv.IMul %127, %5 : i32
        %168 = spv.IAdd %166, %167 : i32
        %169 = spv.IAdd %168, %4 : i32
        %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %171 = spv.Load "StorageBuffer" %170 : f32
        %172 = spv.IMul %160, %11 : i32
        %173 = spv.IMul %127, %5 : i32
        %174 = spv.IAdd %172, %173 : i32
        %175 = spv.IAdd %174, %2 : i32
        %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %177 = spv.Load "StorageBuffer" %176 : f32
        %178 = spv.CompositeConstruct %165, %171, %177 : vector<3xf32>
        %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
        %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
        %181 = spv.FMul %180, %109 : vector<4xf32>
        %182 = spv.FAdd %181, %98 : vector<4xf32>
        %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
        %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
        %185 = spv.FMul %184, %116 : vector<4xf32>
        %186 = spv.FAdd %185, %182 : vector<4xf32>
        %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
        %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
        %189 = spv.FMul %188, %123 : vector<4xf32>
        %190 = spv.FAdd %189, %186 : vector<4xf32>
        // Third accumulator: row coordinate offset by +4 (%1 = 4); result %223.
        %191 = spv.IAdd %81, %1 : i32
        %192 = spv.IAdd %41, %191 : i32
        %193 = spv.IAdd %28, %192 : i32
        %194 = spv.IMul %193, %11 : i32
        %195 = spv.IMul %127, %5 : i32
        %196 = spv.IAdd %194, %195 : i32
        %197 = spv.AccessChain %13[%6, %196] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %198 = spv.Load "StorageBuffer" %197 : f32
        %199 = spv.IMul %193, %11 : i32
        %200 = spv.IMul %127, %5 : i32
        %201 = spv.IAdd %199, %200 : i32
        %202 = spv.IAdd %201, %4 : i32
        %203 = spv.AccessChain %13[%6, %202] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %204 = spv.Load "StorageBuffer" %203 : f32
        %205 = spv.IMul %193, %11 : i32
        %206 = spv.IMul %127, %5 : i32
        %207 = spv.IAdd %205, %206 : i32
        %208 = spv.IAdd %207, %2 : i32
        %209 = spv.AccessChain %13[%6, %208] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %210 = spv.Load "StorageBuffer" %209 : f32
        %211 = spv.CompositeConstruct %198, %204, %210 : vector<3xf32>
        %212 = spv.CompositeExtract %211[0 : i32] : vector<3xf32>
        %213 = spv.CompositeConstruct %212, %212, %212, %212 : vector<4xf32>
        %214 = spv.FMul %213, %109 : vector<4xf32>
        %215 = spv.FAdd %214, %99 : vector<4xf32>
        %216 = spv.CompositeExtract %211[1 : i32] : vector<3xf32>
        %217 = spv.CompositeConstruct %216, %216, %216, %216 : vector<4xf32>
        %218 = spv.FMul %217, %116 : vector<4xf32>
        %219 = spv.FAdd %218, %215 : vector<4xf32>
        %220 = spv.CompositeExtract %211[2 : i32] : vector<3xf32>
        %221 = spv.CompositeConstruct %220, %220, %220, %220 : vector<4xf32>
        %222 = spv.FMul %221, %123 : vector<4xf32>
        %223 = spv.FAdd %222, %219 : vector<4xf32>
        // Fourth accumulator: row coordinate offset by +6 (%3 = 6); result %256.
        %224 = spv.IAdd %81, %3 : i32
        %225 = spv.IAdd %41, %224 : i32
        %226 = spv.IAdd %28, %225 : i32
        %227 = spv.IMul %226, %11 : i32
        %228 = spv.IMul %127, %5 : i32
        %229 = spv.IAdd %227, %228 : i32
        %230 = spv.AccessChain %13[%6, %229] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %231 = spv.Load "StorageBuffer" %230 : f32
        %232 = spv.IMul %226, %11 : i32
        %233 = spv.IMul %127, %5 : i32
        %234 = spv.IAdd %232, %233 : i32
        %235 = spv.IAdd %234, %4 : i32
        %236 = spv.AccessChain %13[%6, %235] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %237 = spv.Load "StorageBuffer" %236 : f32
        %238 = spv.IMul %226, %11 : i32
        %239 = spv.IMul %127, %5 : i32
        %240 = spv.IAdd %238, %239 : i32
        %241 = spv.IAdd %240, %2 : i32
        %242 = spv.AccessChain %13[%6, %241] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %243 = spv.Load "StorageBuffer" %242 : f32
        %244 = spv.CompositeConstruct %231, %237, %243 : vector<3xf32>
        %245 = spv.CompositeExtract %244[0 : i32] : vector<3xf32>
        %246 = spv.CompositeConstruct %245, %245, %245, %245 : vector<4xf32>
        %247 = spv.FMul %246, %109 : vector<4xf32>
        %248 = spv.FAdd %247, %100 : vector<4xf32>
        %249 = spv.CompositeExtract %244[1 : i32] : vector<3xf32>
        %250 = spv.CompositeConstruct %249, %249, %249, %249 : vector<4xf32>
        %251 = spv.FMul %250, %116 : vector<4xf32>
        %252 = spv.FAdd %251, %248 : vector<4xf32>
        %253 = spv.CompositeExtract %244[2 : i32] : vector<3xf32>
        %254 = spv.CompositeConstruct %253, %253, %253, %253 : vector<4xf32>
        %255 = spv.FMul %254, %123 : vector<4xf32>
        %256 = spv.FAdd %255, %252 : vector<4xf32>
        // Publish the updated accumulators to the inner-loop slots and pass
        // the same values along the back edge with the incremented counter.
        spv.Store "Function" %87, %157 : vector<4xf32>
        spv.Store "Function" %88, %190 : vector<4xf32>
        spv.Store "Function" %89, %223 : vector<4xf32>
        spv.Store "Function" %90, %256 : vector<4xf32>
        %257 = spv.IAdd %96, %4 : i32
        spv.Branch ^bb1(%257, %157, %190, %223, %256 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      // Read the inner loop's results back (note the reversed load order:
      // %94 is the first accumulator, %91 the fourth) and forward them to the
      // outer-loop slots and the outer back edge.
      %91 = spv.Load "Function" %90 : vector<4xf32>
      %92 = spv.Load "Function" %89 : vector<4xf32>
      %93 = spv.Load "Function" %88 : vector<4xf32>
      %94 = spv.Load "Function" %87 : vector<4xf32>
      spv.Store "Function" %43, %94 : vector<4xf32>
      spv.Store "Function" %44, %93 : vector<4xf32>
      spv.Store "Function" %45, %92 : vector<4xf32>
      spv.Store "Function" %46, %91 : vector<4xf32>
      %95 = spv.IAdd %81, %4 : i32
      spv.Branch ^bb1(%95, %94, %93, %92, %91 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
    ^bb3:  // pred: ^bb1
      spv.mlir.merge
    }
    // Final accumulators: %50 = first ... %47 = fourth.
    %47 = spv.Load "Function" %46 : vector<4xf32>
    %48 = spv.Load "Function" %45 : vector<4xf32>
    %49 = spv.Load "Function" %44 : vector<4xf32>
    %50 = spv.Load "Function" %43 : vector<4xf32>
    // Four stores to binding 2 at
    //   (wg.z * 4 + local.z * 4 + k) * 896 + (wg.y * 4 + local.y) * 8 + %55
    // with %55 = (wg.x * 16 + local.x * 4) / 4. They are emitted for
    // k = 3, 2, 1, 0, so the fourth accumulator (binding-0 row offset +6)
    // goes to k = 3 and the first (offset +0) goes to k = 0.
    %51 = spv.IAdd %39, %5 : i32
    %52 = spv.IAdd %25, %51 : i32
    %53 = spv.IAdd %26, %35 : i32
    %54 = spv.IAdd %27, %40 : i32
    %55 = spv.SDiv %54, %1 : i32
    %56 = spv.IMul %52, %12 : i32
    %57 = spv.IMul %53, %8 : i32
    %58 = spv.IAdd %56, %57 : i32
    %59 = spv.IAdd %58, %55 : i32
    %60 = spv.AccessChain %15[%6, %59] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %60, %47 : vector<4xf32>
    // k = 2
    %61 = spv.IAdd %39, %2 : i32
    %62 = spv.IAdd %25, %61 : i32
    %63 = spv.IMul %62, %12 : i32
    %64 = spv.IMul %53, %8 : i32
    %65 = spv.IAdd %63, %64 : i32
    %66 = spv.IAdd %65, %55 : i32
    %67 = spv.AccessChain %15[%6, %66] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %67, %48 : vector<4xf32>
    // k = 1
    %68 = spv.IAdd %39, %4 : i32
    %69 = spv.IAdd %25, %68 : i32
    %70 = spv.IMul %69, %12 : i32
    %71 = spv.IMul %53, %8 : i32
    %72 = spv.IAdd %70, %71 : i32
    %73 = spv.IAdd %72, %55 : i32
    %74 = spv.AccessChain %15[%6, %73] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %74, %49 : vector<4xf32>
    // k = 0
    %75 = spv.IAdd %25, %39 : i32
    %76 = spv.IMul %75, %12 : i32
    %77 = spv.IMul %53, %8 : i32
    %78 = spv.IAdd %76, %77 : i32
    %79 = spv.IAdd %78, %55 : i32
    %80 = spv.AccessChain %15[%6, %79] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %80, %50 : vector<4xf32>
    spv.Return
  }
  // Entry point declaration listing the two built-in variables it uses, and
  // its workgroup size of 4 x 4 x 1 invocations.
  spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}

// *** IR Dump After CSE ***
// SPIR-V module holding the single compute kernel @conv_dispatch_0.
// Storage buffers used by the kernel (all in descriptor set 0):
//   binding 0: 151875 x f32            -- only loaded from, inside the loops
//   binding 1: 216 x vector<4xf32>     -- only loaded from, inside the loops
//   binding 2: 100352 x vector<4xf32>  -- only stored to, after the loops
// The element counts equal those of the 1x225x225x3 input, 3x3x3x32 filter and
// 1x112x112x32 result of @conv at the top of this dump (presumably in that
// order -- the binding-to-tensor mapping itself is not shown in this module).
spv.module Logical GLSL450 {
  // Built-in inputs read by the kernel.
  spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Storage buffers at bindings 2, 1 and 0 respectively.
  spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
  spv.func @conv_dispatch_0() "None" {
    // Constants used as loop bounds, steps and index strides below.
    %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
    %1 = spv.Constant 4 : i32
    %2 = spv.Constant 2 : i32
    %3 = spv.Constant 6 : i32
    %4 = spv.Constant 1 : i32
    %5 = spv.Constant 3 : i32
    %6 = spv.Constant 0 : i32
    %7 = spv.Constant 16 : i32
    %8 = spv.Constant 8 : i32
    %9 = spv.Constant 72 : i32
    %10 = spv.Constant 24 : i32
    %11 = spv.Constant 675 : i32
    %12 = spv.Constant 896 : i32
    // Buffer pointers: %13 = binding 0, %14 = binding 1, %15 = binding 2.
    %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
    %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    // WorkgroupId components: %18 = x, %20 = y, %22 = z.
    // The built-in is loaded once per component; the three loads are still separate ops in this dump.
    %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %17 = spv.Load "Input" %16 : vector<3xi32>
    %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
    %19 = spv.Load "Input" %16 : vector<3xi32>
    %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
    %21 = spv.Load "Input" %16 : vector<3xi32>
    %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
    // Per-workgroup offsets: %23 = wg.z*4, %24 = wg.y*4, %25 = wg.x*16, %26 = wg.z*8, %27 = wg.y*8.
    %23 = spv.IMul %22, %1 : i32
    %24 = spv.IMul %20, %1 : i32
    %25 = spv.IMul %18, %7 : i32
    %26 = spv.IMul %22, %8 : i32
    %27 = spv.IMul %20, %8 : i32
    // LocalInvocationId components: %30 = x, %32 = y, %34 = z.
    %28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %29 = spv.Load "Input" %28 : vector<3xi32>
    %30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
    %31 = spv.Load "Input" %28 : vector<3xi32>
    %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
    %33 = spv.Load "Input" %28 : vector<3xi32>
    %34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
    // Per-invocation offsets: %35 = lid.z*4, %36 = lid.x*4, %37 = lid.z*8, %38 = lid.y*2.
    %35 = spv.IMul %34, %1 : i32
    %36 = spv.IMul %30, %1 : i32
    %37 = spv.IMul %34, %8 : i32
    %38 = spv.IMul %32, %2 : i32
    // Function-storage slots the outer loop writes its four accumulators to;
    // they are read back after the loop (%39 = accumulator 0 ... %42 = accumulator 3).
    %39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    // Outer loop: %74 runs 0, 1, 2 (bound %5 = 3, step %4 = 1) and carries the
    // four vec4 accumulators %75..%78, all starting from zero (%0).
    spv.mlir.loop {
      spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
    ^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
      %79 = spv.SLessThan %74, %5 : i32
      spv.BranchConditional %79, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Slots that receive the inner loop's accumulators (%80 = accumulator 0 ... %83 = accumulator 3).
      %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      // Inner loop: %89 runs 0, 1, 2 and continues the accumulators from the outer loop (%90..%93).
      spv.mlir.loop {
        spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %94 = spv.SLessThan %89, %5 : i32
        spv.BranchConditional %94, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // %96 = (wg.x*16 + lid.x*4) / 4. It depends on neither %74 nor %89, yet is recomputed on every iteration.
        %95 = spv.IAdd %25, %36 : i32
        %96 = spv.SDiv %95, %1 : i32
        // Binding-1 loads at index %74*72 + %89*24 + {0, 8, 16} + %96 -> %102, %106, %110.
        %97 = spv.IMul %74, %9 : i32
        %98 = spv.IMul %89, %10 : i32
        %99 = spv.IAdd %97, %98 : i32
        %100 = spv.IAdd %99, %96 : i32
        %101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
        %103 = spv.IAdd %99, %8 : i32
        %104 = spv.IAdd %103, %96 : i32
        %105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
        %107 = spv.IAdd %99, %7 : i32
        %108 = spv.IAdd %107, %96 : i32
        %109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
        // Binding-0 base index %117 = (wg.z*8 + lid.z*8 + %74)*675 + (wg.y*8 + lid.y*2 + %89)*3.
        // %116 (the second term) is reused by all four accumulator groups below.
        %111 = spv.IAdd %37, %74 : i32
        %112 = spv.IAdd %38, %89 : i32
        %113 = spv.IAdd %26, %111 : i32
        %114 = spv.IAdd %27, %112 : i32
        %115 = spv.IMul %113, %11 : i32
        %116 = spv.IMul %114, %5 : i32
        %117 = spv.IAdd %115, %116 : i32
        // Three consecutive f32 values at %117 + {0, 1, 2}, packed into the vec3 %126.
        %118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %119 = spv.Load "StorageBuffer" %118 : f32
        %120 = spv.IAdd %117, %4 : i32
        %121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %122 = spv.Load "StorageBuffer" %121 : f32
        %123 = spv.IAdd %117, %2 : i32
        %124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %125 = spv.Load "StorageBuffer" %124 : f32
        %126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
        // Accumulator 0: each vec3 lane is splatted to a vec4, multiplied by %102 / %106 / %110
        // and added onto %90, giving %138.
        %127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
        %128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
        %129 = spv.FMul %128, %102 : vector<4xf32>
        %130 = spv.FAdd %129, %90 : vector<4xf32>
        %131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
        %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
        %133 = spv.FMul %132, %106 : vector<4xf32>
        %134 = spv.FAdd %133, %130 : vector<4xf32>
        %135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
        %136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
        %137 = spv.FMul %136, %110 : vector<4xf32>
        %138 = spv.FAdd %137, %134 : vector<4xf32>
        // Accumulator 1: same pattern with %74 + 2 in the first index term, added onto %91, giving %164.
        %139 = spv.IAdd %74, %2 : i32
        %140 = spv.IAdd %37, %139 : i32
        %141 = spv.IAdd %26, %140 : i32
        %142 = spv.IMul %141, %11 : i32
        %143 = spv.IAdd %142, %116 : i32
        %144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %145 = spv.Load "StorageBuffer" %144 : f32
        %146 = spv.IAdd %143, %4 : i32
        %147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %148 = spv.Load "StorageBuffer" %147 : f32
        %149 = spv.IAdd %143, %2 : i32
        %150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %151 = spv.Load "StorageBuffer" %150 : f32
        %152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
        %153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
        %154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
        %155 = spv.FMul %154, %102 : vector<4xf32>
        %156 = spv.FAdd %155, %91 : vector<4xf32>
        %157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
        %158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
        %159 = spv.FMul %158, %106 : vector<4xf32>
        %160 = spv.FAdd %159, %156 : vector<4xf32>
        %161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
        %162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
        %163 = spv.FMul %162, %110 : vector<4xf32>
        %164 = spv.FAdd %163, %160 : vector<4xf32>
        // Accumulator 2: %74 + 4 in the first index term, added onto %92, giving %190.
        %165 = spv.IAdd %74, %1 : i32
        %166 = spv.IAdd %37, %165 : i32
        %167 = spv.IAdd %26, %166 : i32
        %168 = spv.IMul %167, %11 : i32
        %169 = spv.IAdd %168, %116 : i32
        %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %171 = spv.Load "StorageBuffer" %170 : f32
        %172 = spv.IAdd %169, %4 : i32
        %173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %174 = spv.Load "StorageBuffer" %173 : f32
        %175 = spv.IAdd %169, %2 : i32
        %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %177 = spv.Load "StorageBuffer" %176 : f32
        %178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
        %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
        %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
        %181 = spv.FMul %180, %102 : vector<4xf32>
        %182 = spv.FAdd %181, %92 : vector<4xf32>
        %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
        %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
        %185 = spv.FMul %184, %106 : vector<4xf32>
        %186 = spv.FAdd %185, %182 : vector<4xf32>
        %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
        %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
        %189 = spv.FMul %188, %110 : vector<4xf32>
        %190 = spv.FAdd %189, %186 : vector<4xf32>
        // Accumulator 3: %74 + 6 in the first index term, added onto %93, giving %216.
        %191 = spv.IAdd %74, %3 : i32
        %192 = spv.IAdd %37, %191 : i32
        %193 = spv.IAdd %26, %192 : i32
        %194 = spv.IMul %193, %11 : i32
        %195 = spv.IAdd %194, %116 : i32
        %196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %197 = spv.Load "StorageBuffer" %196 : f32
        %198 = spv.IAdd %195, %4 : i32
        %199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %200 = spv.Load "StorageBuffer" %199 : f32
        %201 = spv.IAdd %195, %2 : i32
        %202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %203 = spv.Load "StorageBuffer" %202 : f32
        %204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
        %205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
        %206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
        %207 = spv.FMul %206, %102 : vector<4xf32>
        %208 = spv.FAdd %207, %93 : vector<4xf32>
        %209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
        %210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
        %211 = spv.FMul %210, %106 : vector<4xf32>
        %212 = spv.FAdd %211, %208 : vector<4xf32>
        %213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
        %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
        %215 = spv.FMul %214, %110 : vector<4xf32>
        %216 = spv.FAdd %215, %212 : vector<4xf32>
        // Write the updated accumulators to the slots %80..%83 and also pass them
        // to the next iteration as block arguments, with %89 advanced by 1.
        spv.Store "Function" %80, %138 : vector<4xf32>
        spv.Store "Function" %81, %164 : vector<4xf32>
        spv.Store "Function" %82, %190 : vector<4xf32>
        spv.Store "Function" %83, %216 : vector<4xf32>
        %217 = spv.IAdd %89, %4 : i32
        spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      // Read the inner loop's results back from the slots. Note the reversed
      // numbering: %87 = accumulator 0 (from %80) ... %84 = accumulator 3 (from %83).
      %84 = spv.Load "Function" %83 : vector<4xf32>
      %85 = spv.Load "Function" %82 : vector<4xf32>
      %86 = spv.Load "Function" %81 : vector<4xf32>
      %87 = spv.Load "Function" %80 : vector<4xf32>
      // Copy them to the outer slots and continue the outer loop with %74 + 1.
      spv.Store "Function" %39, %87 : vector<4xf32>
      spv.Store "Function" %40, %86 : vector<4xf32>
      spv.Store "Function" %41, %85 : vector<4xf32>
      spv.Store "Function" %42, %84 : vector<4xf32>
      %88 = spv.IAdd %74, %4 : i32
      spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
    ^bb3:  // pred: ^bb1
      spv.mlir.merge
    }
    // Final accumulators: %46 = accumulator 0, %45 = 1, %44 = 2, %43 = 3.
    %43 = spv.Load "Function" %42 : vector<4xf32>
    %44 = spv.Load "Function" %41 : vector<4xf32>
    %45 = spv.Load "Function" %40 : vector<4xf32>
    %46 = spv.Load "Function" %39 : vector<4xf32>
    // Binding-2 store index = (wg.z*4 + lid.z*4 + k)*896 + (wg.y*4 + lid.y)*8 + (wg.x*16 + lid.x*4)/4.
    // k = 3: store accumulator 3. %53 and %51 are shared by all four stores.
    %47 = spv.IAdd %35, %5 : i32
    %48 = spv.IAdd %23, %47 : i32
    %49 = spv.IAdd %24, %32 : i32
    %50 = spv.IAdd %25, %36 : i32
    %51 = spv.SDiv %50, %1 : i32
    %52 = spv.IMul %48, %12 : i32
    %53 = spv.IMul %49, %8 : i32
    %54 = spv.IAdd %52, %53 : i32
    %55 = spv.IAdd %54, %51 : i32
    %56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
    // k = 2: store accumulator 2.
    %57 = spv.IAdd %35, %2 : i32
    %58 = spv.IAdd %23, %57 : i32
    %59 = spv.IMul %58, %12 : i32
    %60 = spv.IAdd %59, %53 : i32
    %61 = spv.IAdd %60, %51 : i32
    %62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
    // k = 1: store accumulator 1.
    %63 = spv.IAdd %35, %4 : i32
    %64 = spv.IAdd %23, %63 : i32
    %65 = spv.IMul %64, %12 : i32
    %66 = spv.IAdd %65, %53 : i32
    %67 = spv.IAdd %66, %51 : i32
    %68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
    // k = 0: store accumulator 0.
    %69 = spv.IAdd %23, %35 : i32
    %70 = spv.IMul %69, %12 : i32
    %71 = spv.IAdd %70, %53 : i32
    %72 = spv.IAdd %71, %51 : i32
    %73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
    spv.Return
  }
  // The kernel is a GLCompute entry point with a 4 x 4 x 1 local (workgroup) size.
  spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}

// *** IR Dump After SPIRVUpdateVCE ***
// SPIR-V module holding the single compute kernel @conv_dispatch_0. At this
// point in the dump the module header carries a `requires #spv.vce<...>` clause:
// version v1.0, capability Shader, extension SPV_KHR_storage_buffer_storage_class.
// Storage buffers used by the kernel (all in descriptor set 0):
//   binding 0: 151875 x f32            -- only loaded from, inside the loops
//   binding 1: 216 x vector<4xf32>     -- only loaded from, inside the loops
//   binding 2: 100352 x vector<4xf32>  -- only stored to, after the loops
// The element counts equal those of the 1x225x225x3 input, 3x3x3x32 filter and
// 1x112x112x32 result of @conv at the top of this dump (presumably in that
// order -- the binding-to-tensor mapping itself is not shown in this module).
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
  // Built-in inputs read by the kernel.
  spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Storage buffers at bindings 2, 1 and 0 respectively.
  spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
  spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
  spv.func @conv_dispatch_0() "None" {
    // Constants used as loop bounds, steps and index strides below.
    %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
    %1 = spv.Constant 4 : i32
    %2 = spv.Constant 2 : i32
    %3 = spv.Constant 6 : i32
    %4 = spv.Constant 1 : i32
    %5 = spv.Constant 3 : i32
    %6 = spv.Constant 0 : i32
    %7 = spv.Constant 16 : i32
    %8 = spv.Constant 8 : i32
    %9 = spv.Constant 72 : i32
    %10 = spv.Constant 24 : i32
    %11 = spv.Constant 675 : i32
    %12 = spv.Constant 896 : i32
    // Buffer pointers: %13 = binding 0, %14 = binding 1, %15 = binding 2.
    %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
    %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
    // WorkgroupId components: %18 = x, %20 = y, %22 = z.
    // The built-in is loaded once per component (three separate loads).
    %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %17 = spv.Load "Input" %16 : vector<3xi32>
    %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
    %19 = spv.Load "Input" %16 : vector<3xi32>
    %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
    %21 = spv.Load "Input" %16 : vector<3xi32>
    %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
    // Per-workgroup offsets: %23 = wg.z*4, %24 = wg.y*4, %25 = wg.x*16, %26 = wg.z*8, %27 = wg.y*8.
    %23 = spv.IMul %22, %1 : i32
    %24 = spv.IMul %20, %1 : i32
    %25 = spv.IMul %18, %7 : i32
    %26 = spv.IMul %22, %8 : i32
    %27 = spv.IMul %20, %8 : i32
    // LocalInvocationId components: %30 = x, %32 = y, %34 = z.
    %28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %29 = spv.Load "Input" %28 : vector<3xi32>
    %30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
    %31 = spv.Load "Input" %28 : vector<3xi32>
    %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
    %33 = spv.Load "Input" %28 : vector<3xi32>
    %34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
    // Per-invocation offsets: %35 = lid.z*4, %36 = lid.x*4, %37 = lid.z*8, %38 = lid.y*2.
    %35 = spv.IMul %34, %1 : i32
    %36 = spv.IMul %30, %1 : i32
    %37 = spv.IMul %34, %8 : i32
    %38 = spv.IMul %32, %2 : i32
    // Function-storage slots the outer loop writes its four accumulators to;
    // they are read back after the loop (%39 = accumulator 0 ... %42 = accumulator 3).
    %39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
    // Outer loop: %74 runs 0, 1, 2 (bound %5 = 3, step %4 = 1) and carries the
    // four vec4 accumulators %75..%78, all starting from zero (%0).
    spv.mlir.loop {
      spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
    ^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
      %79 = spv.SLessThan %74, %5 : i32
      spv.BranchConditional %79, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Slots that receive the inner loop's accumulators (%80 = accumulator 0 ... %83 = accumulator 3).
      %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      %83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
      // Inner loop: %89 runs 0, 1, 2 and continues the accumulators from the outer loop (%90..%93).
      spv.mlir.loop {
        spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
        %94 = spv.SLessThan %89, %5 : i32
        spv.BranchConditional %94, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // %96 = (wg.x*16 + lid.x*4) / 4. It depends on neither %74 nor %89, yet is recomputed on every iteration.
        %95 = spv.IAdd %25, %36 : i32
        %96 = spv.SDiv %95, %1 : i32
        // Binding-1 loads at index %74*72 + %89*24 + {0, 8, 16} + %96 -> %102, %106, %110.
        %97 = spv.IMul %74, %9 : i32
        %98 = spv.IMul %89, %10 : i32
        %99 = spv.IAdd %97, %98 : i32
        %100 = spv.IAdd %99, %96 : i32
        %101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
        %103 = spv.IAdd %99, %8 : i32
        %104 = spv.IAdd %103, %96 : i32
        %105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
        %107 = spv.IAdd %99, %7 : i32
        %108 = spv.IAdd %107, %96 : i32
        %109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
        // Binding-0 base index %117 = (wg.z*8 + lid.z*8 + %74)*675 + (wg.y*8 + lid.y*2 + %89)*3.
        // %116 (the second term) is reused by all four accumulator groups below.
        %111 = spv.IAdd %37, %74 : i32
        %112 = spv.IAdd %38, %89 : i32
        %113 = spv.IAdd %26, %111 : i32
        %114 = spv.IAdd %27, %112 : i32
        %115 = spv.IMul %113, %11 : i32
        %116 = spv.IMul %114, %5 : i32
        %117 = spv.IAdd %115, %116 : i32
        // Three consecutive f32 values at %117 + {0, 1, 2}, packed into the vec3 %126.
        %118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %119 = spv.Load "StorageBuffer" %118 : f32
        %120 = spv.IAdd %117, %4 : i32
        %121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %122 = spv.Load "StorageBuffer" %121 : f32
        %123 = spv.IAdd %117, %2 : i32
        %124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %125 = spv.Load "StorageBuffer" %124 : f32
        %126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
        // Accumulator 0: each vec3 lane is splatted to a vec4, multiplied by %102 / %106 / %110
        // and added onto %90, giving %138.
        %127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
        %128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
        %129 = spv.FMul %128, %102 : vector<4xf32>
        %130 = spv.FAdd %129, %90 : vector<4xf32>
        %131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
        %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
        %133 = spv.FMul %132, %106 : vector<4xf32>
        %134 = spv.FAdd %133, %130 : vector<4xf32>
        %135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
        %136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
        %137 = spv.FMul %136, %110 : vector<4xf32>
        %138 = spv.FAdd %137, %134 : vector<4xf32>
        // Accumulator 1: same pattern with %74 + 2 in the first index term, added onto %91, giving %164.
        %139 = spv.IAdd %74, %2 : i32
        %140 = spv.IAdd %37, %139 : i32
        %141 = spv.IAdd %26, %140 : i32
        %142 = spv.IMul %141, %11 : i32
        %143 = spv.IAdd %142, %116 : i32
        %144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %145 = spv.Load "StorageBuffer" %144 : f32
        %146 = spv.IAdd %143, %4 : i32
        %147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %148 = spv.Load "StorageBuffer" %147 : f32
        %149 = spv.IAdd %143, %2 : i32
        %150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %151 = spv.Load "StorageBuffer" %150 : f32
        %152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
        %153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
        %154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
        %155 = spv.FMul %154, %102 : vector<4xf32>
        %156 = spv.FAdd %155, %91 : vector<4xf32>
        %157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
        %158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
        %159 = spv.FMul %158, %106 : vector<4xf32>
        %160 = spv.FAdd %159, %156 : vector<4xf32>
        %161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
        %162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
        %163 = spv.FMul %162, %110 : vector<4xf32>
        %164 = spv.FAdd %163, %160 : vector<4xf32>
        // Accumulator 2: %74 + 4 in the first index term, added onto %92, giving %190.
        %165 = spv.IAdd %74, %1 : i32
        %166 = spv.IAdd %37, %165 : i32
        %167 = spv.IAdd %26, %166 : i32
        %168 = spv.IMul %167, %11 : i32
        %169 = spv.IAdd %168, %116 : i32
        %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %171 = spv.Load "StorageBuffer" %170 : f32
        %172 = spv.IAdd %169, %4 : i32
        %173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %174 = spv.Load "StorageBuffer" %173 : f32
        %175 = spv.IAdd %169, %2 : i32
        %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %177 = spv.Load "StorageBuffer" %176 : f32
        %178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
        %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
        %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
        %181 = spv.FMul %180, %102 : vector<4xf32>
        %182 = spv.FAdd %181, %92 : vector<4xf32>
        %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
        %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
        %185 = spv.FMul %184, %106 : vector<4xf32>
        %186 = spv.FAdd %185, %182 : vector<4xf32>
        %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
        %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
        %189 = spv.FMul %188, %110 : vector<4xf32>
        %190 = spv.FAdd %189, %186 : vector<4xf32>
        // Accumulator 3: %74 + 6 in the first index term, added onto %93, giving %216.
        %191 = spv.IAdd %74, %3 : i32
        %192 = spv.IAdd %37, %191 : i32
        %193 = spv.IAdd %26, %192 : i32
        %194 = spv.IMul %193, %11 : i32
        %195 = spv.IAdd %194, %116 : i32
        %196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %197 = spv.Load "StorageBuffer" %196 : f32
        %198 = spv.IAdd %195, %4 : i32
        %199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %200 = spv.Load "StorageBuffer" %199 : f32
        %201 = spv.IAdd %195, %2 : i32
        %202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
        %203 = spv.Load "StorageBuffer" %202 : f32
        %204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
        %205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
        %206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
        %207 = spv.FMul %206, %102 : vector<4xf32>
        %208 = spv.FAdd %207, %93 : vector<4xf32>
        %209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
        %210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
        %211 = spv.FMul %210, %106 : vector<4xf32>
        %212 = spv.FAdd %211, %208 : vector<4xf32>
        %213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
        %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
        %215 = spv.FMul %214, %110 : vector<4xf32>
        %216 = spv.FAdd %215, %212 : vector<4xf32>
        // Write the updated accumulators to the slots %80..%83 and also pass them
        // to the next iteration as block arguments, with %89 advanced by 1.
        spv.Store "Function" %80, %138 : vector<4xf32>
        spv.Store "Function" %81, %164 : vector<4xf32>
        spv.Store "Function" %82, %190 : vector<4xf32>
        spv.Store "Function" %83, %216 : vector<4xf32>
        %217 = spv.IAdd %89, %4 : i32
        spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
      ^bb3:  // pred: ^bb1
        spv.mlir.merge
      }
      // Read the inner loop's results back from the slots. Note the reversed
      // numbering: %87 = accumulator 0 (from %80) ... %84 = accumulator 3 (from %83).
      %84 = spv.Load "Function" %83 : vector<4xf32>
      %85 = spv.Load "Function" %82 : vector<4xf32>
      %86 = spv.Load "Function" %81 : vector<4xf32>
      %87 = spv.Load "Function" %80 : vector<4xf32>
      // Copy them to the outer slots and continue the outer loop with %74 + 1.
      spv.Store "Function" %39, %87 : vector<4xf32>
      spv.Store "Function" %40, %86 : vector<4xf32>
      spv.Store "Function" %41, %85 : vector<4xf32>
      spv.Store "Function" %42, %84 : vector<4xf32>
      %88 = spv.IAdd %74, %4 : i32
      spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
    ^bb3:  // pred: ^bb1
      spv.mlir.merge
    }
    // Final accumulators: %46 = accumulator 0, %45 = 1, %44 = 2, %43 = 3.
    %43 = spv.Load "Function" %42 : vector<4xf32>
    %44 = spv.Load "Function" %41 : vector<4xf32>
    %45 = spv.Load "Function" %40 : vector<4xf32>
    %46 = spv.Load "Function" %39 : vector<4xf32>
    // Binding-2 store index = (wg.z*4 + lid.z*4 + k)*896 + (wg.y*4 + lid.y)*8 + (wg.x*16 + lid.x*4)/4.
    // k = 3: store accumulator 3. %53 and %51 are shared by all four stores.
    %47 = spv.IAdd %35, %5 : i32
    %48 = spv.IAdd %23, %47 : i32
    %49 = spv.IAdd %24, %32 : i32
    %50 = spv.IAdd %25, %36 : i32
    %51 = spv.SDiv %50, %1 : i32
    %52 = spv.IMul %48, %12 : i32
    %53 = spv.IMul %49, %8 : i32
    %54 = spv.IAdd %52, %53 : i32
    %55 = spv.IAdd %54, %51 : i32
    %56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
    // k = 2: store accumulator 2.
    %57 = spv.IAdd %35, %2 : i32
    %58 = spv.IAdd %23, %57 : i32
    %59 = spv.IMul %58, %12 : i32
    %60 = spv.IAdd %59, %53 : i32
    %61 = spv.IAdd %60, %51 : i32
    %62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
    // k = 1: store accumulator 1.
    %63 = spv.IAdd %35, %4 : i32
    %64 = spv.IAdd %23, %63 : i32
    %65 = spv.IMul %64, %12 : i32
    %66 = spv.IAdd %65, %53 : i32
    %67 = spv.IAdd %66, %51 : i32
    %68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
    // k = 0: store accumulator 0.
    %69 = spv.IAdd %23, %35 : i32
    %70 = spv.IMul %69, %12 : i32
    %71 = spv.IAdd %70, %53 : i32
    %72 = spv.IAdd %71, %51 : i32
    %73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
    spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
    spv.Return
  }
  // The kernel is a GLCompute entry point with a 4 x 4 x 1 local (workgroup) size.
  spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass ***
hal.executable.target @vulkan_spirv, filter="vulkan*" {
  // Entry point for the conv dispatch (ordinal 0, interface @legacy_io).
  // The region below ignores its three index arguments and always returns the
  // constant triple (2, 28, 28).
  // NOTE(review): presumably the workgroup count along x/y/z. The kernel scales
  // WorkgroupId components by 16 (x), 4 (y) and 4 (z) for its output indices,
  // which would cover 32 x 112 x 112 outputs - confirm against the HAL docs.
  hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
  ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
    %c2 = constant 2 : index
    %c28 = constant 28 : index
    %c28_0 = constant 28 : index
    hal.return %c2, %c28, %c28_0 : index, index, index
  }
  module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
    spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
      // Built-in inputs read by the kernel to derive its tile coordinates.
      spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
      spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
      // Storage buffers. NOTE(review): bind(0, N) is presumably (set, binding),
      // lining up with the set=0 bindings 0..2 of hal.interface @legacy_io.
      // bind(0, 2): 100352 x vec4 = 401408 f32 = 1*112*112*32 (the kernel stores here).
      spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
      // bind(0, 1): 216 x vec4 = 864 f32 = 3*3*3*32 (loaded as vec4 in the kernel).
      spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
      // bind(0, 0): 151875 x f32 = 1*225*225*3 (loaded as scalars in the kernel).
      spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
      // Compute kernel for the conv dispatch. Each invocation keeps four
      // vector<4xf32> accumulators, updates them inside two nested loops that
      // each run their induction variable over 0, 1, 2, and finally stores the
      // four accumulators to the binding-2 buffer.
      //
      // Linear indices formed below (wg = WorkgroupId, lid = LocalInvocationId):
      //   binding 0 (f32):  (wg.z*8 + lid.z*8 + i + {0,2,4,6}) * 675
      //                     + (wg.y*8 + lid.y*2 + j) * 3 + {0,1,2}
      //   binding 1 (vec4): i*72 + j*24 + {0,8,16} + c
      //   binding 2 (vec4): (wg.z*4 + lid.z*4 + {0,1,2,3}) * 896
      //                     + (wg.y*4 + lid.y) * 8 + c
      //   with c = (wg.x*16 + lid.x*4) / 4, i = outer counter, j = inner counter.
      // NOTE(review): 675 = 225*3, 72 = 3*3*8, 24 = 3*8 and 896 = 112*8 line up
      // with the 1x225x225x3 / 3x3x3x32 / 1x112x112x32 shapes in the entry point
      // signature, and the {0,2,4,6} row offsets with a stride of 2 - confirm.
      spv.func @conv_dispatch_0() "None" {
        // Zero vector used to initialise the accumulators, then the integer
        // constants used as strides, offsets and loop bounds.
        %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
        %1 = spv.Constant 4 : i32
        %2 = spv.Constant 2 : i32
        %3 = spv.Constant 6 : i32
        %4 = spv.Constant 1 : i32
        %5 = spv.Constant 3 : i32
        %6 = spv.Constant 0 : i32
        %7 = spv.Constant 16 : i32
        %8 = spv.Constant 8 : i32
        %9 = spv.Constant 72 : i32
        %10 = spv.Constant 24 : i32
        %11 = spv.Constant 675 : i32
        %12 = spv.Constant 896 : i32
        // Pointers to the three storage buffers: %13 -> bind(0, 0), %14 -> bind(0, 1), %15 -> bind(0, 2).
        %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
        %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
        %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
        // WorkgroupId components: %18 = [0], %20 = [1], %22 = [2]. The built-in
        // is loaded once per extracted component.
        %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
        %17 = spv.Load "Input" %16 : vector<3xi32>
        %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
        %19 = spv.Load "Input" %16 : vector<3xi32>
        %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
        %21 = spv.Load "Input" %16 : vector<3xi32>
        %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
        // Per-workgroup offsets: %23/%24/%25 (x4, x4, x16) feed the store
        // indices; %26/%27 (x8, x8) feed the binding-0 load indices.
        %23 = spv.IMul %22, %1 : i32
        %24 = spv.IMul %20, %1 : i32
        %25 = spv.IMul %18, %7 : i32
        %26 = spv.IMul %22, %8 : i32
        %27 = spv.IMul %20, %8 : i32
        // LocalInvocationId components: %30 = [0], %32 = [1], %34 = [2].
        %28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %29 = spv.Load "Input" %28 : vector<3xi32>
        %30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
        %31 = spv.Load "Input" %28 : vector<3xi32>
        %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
        %33 = spv.Load "Input" %28 : vector<3xi32>
        %34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
        // Per-invocation offsets: %35/%36 (x4) feed the store indices;
        // %37 (x8) and %38 (x2) feed the binding-0 load indices.
        %35 = spv.IMul %34, %1 : i32
        %36 = spv.IMul %30, %1 : i32
        %37 = spv.IMul %34, %8 : i32
        %38 = spv.IMul %32, %2 : i32
        // Function-storage slots that carry the four accumulators out of the
        // outer loop; they are read back at %43..%46 after the loop.
        %39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
        // Outer loop: %74 counts 0, 1, 2; %75..%78 are the loop-carried
        // accumulators, all starting from the zero vector %0.
        spv.mlir.loop {
          spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
          %79 = spv.SLessThan %74, %5 : i32
          spv.BranchConditional %79, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Slots that carry the accumulators out of the inner loop.
          %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          %83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
          // Inner loop: %89 counts 0, 1, 2; %90..%93 continue from the outer
          // loop's current accumulators %75..%78.
          spv.mlir.loop {
            spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
          ^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
            %94 = spv.SLessThan %89, %5 : i32
            spv.BranchConditional %94, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // %96 = (%25 + %36) / 4; it depends on neither loop counter and is
            // the same expression as %51 computed after the loops.
            %95 = spv.IAdd %25, %36 : i32
            %96 = spv.SDiv %95, %1 : i32
            // Base index into binding 1 for this (%74, %89) pair.
            %97 = spv.IMul %74, %9 : i32
            %98 = spv.IMul %89, %10 : i32
            %99 = spv.IAdd %97, %98 : i32
            // Three vec4 loads from binding 1 at base + {0, 8, 16} + %96,
            // giving %102, %106 and %110; all four accumulators reuse them.
            %100 = spv.IAdd %99, %96 : i32
            %101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            %102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
            %103 = spv.IAdd %99, %8 : i32
            %104 = spv.IAdd %103, %96 : i32
            %105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            %106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
            %107 = spv.IAdd %99, %7 : i32
            %108 = spv.IAdd %107, %96 : i32
            %109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
            // Accumulator 0 (row offset %74 + 0): load three consecutive f32
            // from binding 0, splat each to vec4, multiply by %102/%106/%110
            // and add onto %90. %116 (the column term) is reused by the other
            // three accumulators.
            %111 = spv.IAdd %37, %74 : i32
            %112 = spv.IAdd %38, %89 : i32
            %113 = spv.IAdd %26, %111 : i32
            %114 = spv.IAdd %27, %112 : i32
            %115 = spv.IMul %113, %11 : i32
            %116 = spv.IMul %114, %5 : i32
            %117 = spv.IAdd %115, %116 : i32
            %118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %119 = spv.Load "StorageBuffer" %118 : f32
            %120 = spv.IAdd %117, %4 : i32
            %121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %122 = spv.Load "StorageBuffer" %121 : f32
            %123 = spv.IAdd %117, %2 : i32
            %124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %125 = spv.Load "StorageBuffer" %124 : f32
            %126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
            %127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
            %128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
            %129 = spv.FMul %128, %102 : vector<4xf32>
            %130 = spv.FAdd %129, %90 : vector<4xf32>
            %131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
            %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
            %133 = spv.FMul %132, %106 : vector<4xf32>
            %134 = spv.FAdd %133, %130 : vector<4xf32>
            %135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
            %136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
            %137 = spv.FMul %136, %110 : vector<4xf32>
            %138 = spv.FAdd %137, %134 : vector<4xf32>
            // Accumulator 1 (row offset %74 + 2): same pattern, added onto %91.
            %139 = spv.IAdd %74, %2 : i32
            %140 = spv.IAdd %37, %139 : i32
            %141 = spv.IAdd %26, %140 : i32
            %142 = spv.IMul %141, %11 : i32
            %143 = spv.IAdd %142, %116 : i32
            %144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %145 = spv.Load "StorageBuffer" %144 : f32
            %146 = spv.IAdd %143, %4 : i32
            %147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %148 = spv.Load "StorageBuffer" %147 : f32
            %149 = spv.IAdd %143, %2 : i32
            %150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %151 = spv.Load "StorageBuffer" %150 : f32
            %152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
            %153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
            %154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
            %155 = spv.FMul %154, %102 : vector<4xf32>
            %156 = spv.FAdd %155, %91 : vector<4xf32>
            %157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
            %158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
            %159 = spv.FMul %158, %106 : vector<4xf32>
            %160 = spv.FAdd %159, %156 : vector<4xf32>
            %161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
            %162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
            %163 = spv.FMul %162, %110 : vector<4xf32>
            %164 = spv.FAdd %163, %160 : vector<4xf32>
            // Accumulator 2 (row offset %74 + 4): same pattern, added onto %92.
            %165 = spv.IAdd %74, %1 : i32
            %166 = spv.IAdd %37, %165 : i32
            %167 = spv.IAdd %26, %166 : i32
            %168 = spv.IMul %167, %11 : i32
            %169 = spv.IAdd %168, %116 : i32
            %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %171 = spv.Load "StorageBuffer" %170 : f32
            %172 = spv.IAdd %169, %4 : i32
            %173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %174 = spv.Load "StorageBuffer" %173 : f32
            %175 = spv.IAdd %169, %2 : i32
            %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %177 = spv.Load "StorageBuffer" %176 : f32
            %178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
            %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
            %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
            %181 = spv.FMul %180, %102 : vector<4xf32>
            %182 = spv.FAdd %181, %92 : vector<4xf32>
            %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
            %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
            %185 = spv.FMul %184, %106 : vector<4xf32>
            %186 = spv.FAdd %185, %182 : vector<4xf32>
            %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
            %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
            %189 = spv.FMul %188, %110 : vector<4xf32>
            %190 = spv.FAdd %189, %186 : vector<4xf32>
            // Accumulator 3 (row offset %74 + 6): same pattern, added onto %93.
            %191 = spv.IAdd %74, %3 : i32
            %192 = spv.IAdd %37, %191 : i32
            %193 = spv.IAdd %26, %192 : i32
            %194 = spv.IMul %193, %11 : i32
            %195 = spv.IAdd %194, %116 : i32
            %196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %197 = spv.Load "StorageBuffer" %196 : f32
            %198 = spv.IAdd %195, %4 : i32
            %199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %200 = spv.Load "StorageBuffer" %199 : f32
            %201 = spv.IAdd %195, %2 : i32
            %202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
            %203 = spv.Load "StorageBuffer" %202 : f32
            %204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
            %205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
            %206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
            %207 = spv.FMul %206, %102 : vector<4xf32>
            %208 = spv.FAdd %207, %93 : vector<4xf32>
            %209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
            %210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
            %211 = spv.FMul %210, %106 : vector<4xf32>
            %212 = spv.FAdd %211, %208 : vector<4xf32>
            %213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
            %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
            %215 = spv.FMul %214, %110 : vector<4xf32>
            %216 = spv.FAdd %215, %212 : vector<4xf32>
            // Publish the updated accumulators through the inner slots, then
            // continue with %89 + 1 and the same values as block arguments.
            spv.Store "Function" %80, %138 : vector<4xf32>
            spv.Store "Function" %81, %164 : vector<4xf32>
            spv.Store "Function" %82, %190 : vector<4xf32>
            spv.Store "Function" %83, %216 : vector<4xf32>
            %217 = spv.IAdd %89, %4 : i32
            spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
          ^bb3:  // pred: ^bb1
            spv.mlir.merge
          }
          // Read the inner loop's results back (%87 = accumulator 0 ... %84 =
          // accumulator 3), copy them into the outer slots, and continue the
          // outer loop with %74 + 1.
          %84 = spv.Load "Function" %83 : vector<4xf32>
          %85 = spv.Load "Function" %82 : vector<4xf32>
          %86 = spv.Load "Function" %81 : vector<4xf32>
          %87 = spv.Load "Function" %80 : vector<4xf32>
          spv.Store "Function" %39, %87 : vector<4xf32>
          spv.Store "Function" %40, %86 : vector<4xf32>
          spv.Store "Function" %41, %85 : vector<4xf32>
          spv.Store "Function" %42, %84 : vector<4xf32>
          %88 = spv.IAdd %74, %4 : i32
          spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
        ^bb3:  // pred: ^bb1
          spv.mlir.merge
        }
        // Final accumulators: %46 = accumulator 0, %45 = 1, %44 = 2, %43 = 3.
        %43 = spv.Load "Function" %42 : vector<4xf32>
        %44 = spv.Load "Function" %41 : vector<4xf32>
        %45 = spv.Load "Function" %40 : vector<4xf32>
        %46 = spv.Load "Function" %39 : vector<4xf32>
        // Store accumulator 3 at row %23 + %35 + 3. %53 (column term) and %51
        // (vec4 offset) are shared by all four stores.
        %47 = spv.IAdd %35, %5 : i32
        %48 = spv.IAdd %23, %47 : i32
        %49 = spv.IAdd %24, %32 : i32
        %50 = spv.IAdd %25, %36 : i32
        %51 = spv.SDiv %50, %1 : i32
        %52 = spv.IMul %48, %12 : i32
        %53 = spv.IMul %49, %8 : i32
        %54 = spv.IAdd %52, %53 : i32
        %55 = spv.IAdd %54, %51 : i32
        %56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
        // Store accumulator 2 at row %23 + %35 + 2.
        %57 = spv.IAdd %35, %2 : i32
        %58 = spv.IAdd %23, %57 : i32
        %59 = spv.IMul %58, %12 : i32
        %60 = spv.IAdd %59, %53 : i32
        %61 = spv.IAdd %60, %51 : i32
        %62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
        // Store accumulator 1 at row %23 + %35 + 1.
        %63 = spv.IAdd %35, %4 : i32
        %64 = spv.IAdd %23, %63 : i32
        %65 = spv.IMul %64, %12 : i32
        %66 = spv.IAdd %65, %53 : i32
        %67 = spv.IAdd %66, %51 : i32
        %68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
        // Store accumulator 0 at row %23 + %35.
        %69 = spv.IAdd %23, %35 : i32
        %70 = spv.IMul %69, %12 : i32
        %71 = spv.IAdd %70, %53 : i32
        %72 = spv.IAdd %71, %51 : i32
        %73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
        spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
        spv.Return
      }
      // Declares @conv_dispatch_0 as a GLCompute entry point whose interface
      // lists the WorkgroupId and LocalInvocationId built-in variables.
      spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
      // Fixed local size of 4 x 4 x 1 (16 invocations per workgroup).
      spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
    }
    // Private binding interface for the dispatch: three StorageBuffer bindings
    // in set 0. Bindings 0 and 1 are read-only; binding 2 is write/discard.
    hal.interface @legacy_io attributes {sym_visibility = "private"} {
      hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
  }
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass ***
module  {
  hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
    // Binding interface for the dispatch: three StorageBuffer bindings in
    // set 0. Bindings 0 and 1 are read-only; binding 2 is write/discard.
    hal.interface @legacy_io {
      hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.target @vulkan_spirv, filter="vulkan*" {
      // Entry point for the conv dispatch (ordinal 0, interface @legacy_io).
      // The region below ignores its three index arguments and always returns
      // the constant triple (2, 28, 28).
      // NOTE(review): presumably the workgroup count along x/y/z - confirm
      // against the HAL dialect documentation.
      hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c2 = constant 2 : index
        %c28 = constant 28 : index
        %c28_0 = constant 28 : index
        hal.return %c2, %c28, %c28_0 : index, index, index
      }
      module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          // Built-in inputs read by the kernel to derive its tile coordinates.
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          // Storage buffers. NOTE(review): bind(0, N) is presumably (set, binding),
          // lining up with the set=0 bindings 0..2 of hal.interface @legacy_io.
          // bind(0, 2): 100352 x vec4 = 401408 f32 = 1*112*112*32.
          spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
          // bind(0, 1): 216 x vec4 = 864 f32 = 3*3*3*32.
          spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
          // bind(0, 0): 151875 x f32 = 1*225*225*3.
          spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
          spv.func @conv_dispatch_0() "None" {
            %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %1 = spv.Constant 4 : i32
            %2 = spv.Constant 2 : i32
            %3 = spv.Constant 6 : i32
            %4 = spv.Constant 1 : i32
            %5 = spv.Constant 3 : i32
            %6 = spv.Constant 0 : i32
            %7 = spv.Constant 16 : i32
            %8 = spv.Constant 8 : i32
            %9 = spv.Constant 72 : i32
            %10 = spv.Constant 24 : i32
            %11 = spv.Constant 675 : i32
            %12 = spv.Constant 896 : i32
            %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
            %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %17 = spv.Load "Input" %16 : vector<3xi32>
            %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
            %19 = spv.Load "Input" %16 : vector<3xi32>
            %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
            %21 = spv.Load "Input" %16 : vector<3xi32>
            %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
            %23 = spv.IMul %22, %1 : i32
            %24 = spv.IMul %20, %1 : i32
            %25 = spv.IMul %18, %7 : i32
            %26 = spv.IMul %22, %8 : i32
            %27 = spv.IMul %20, %8 : i32
            %28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %29 = spv.Load "Input" %28 : vector<3xi32>
            %30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
            %31 = spv.Load "Input" %28 : vector<3xi32>
            %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
            %33 = spv.Load "Input" %28 : vector<3xi32>
            %34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
            %35 = spv.IMul %34, %1 : i32
            %36 = spv.IMul %30, %1 : i32
            %37 = spv.IMul %34, %8 : i32
            %38 = spv.IMul %32, %2 : i32
            %39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            spv.mlir.loop {
              spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
              %79 = spv.SLessThan %74, %5 : i32
              spv.BranchConditional %79, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              spv.mlir.loop {
                spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %94 = spv.SLessThan %89, %5 : i32
                spv.BranchConditional %94, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                %95 = spv.IAdd %25, %36 : i32
                %96 = spv.SDiv %95, %1 : i32
                %97 = spv.IMul %74, %9 : i32
                %98 = spv.IMul %89, %10 : i32
                %99 = spv.IAdd %97, %98 : i32
                %100 = spv.IAdd %99, %96 : i32
                %101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
                %103 = spv.IAdd %99, %8 : i32
                %104 = spv.IAdd %103, %96 : i32
                %105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
                %107 = spv.IAdd %99, %7 : i32
                %108 = spv.IAdd %107, %96 : i32
                %109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
                %111 = spv.IAdd %37, %74 : i32
                %112 = spv.IAdd %38, %89 : i32
                %113 = spv.IAdd %26, %111 : i32
                %114 = spv.IAdd %27, %112 : i32
                %115 = spv.IMul %113, %11 : i32
                %116 = spv.IMul %114, %5 : i32
                %117 = spv.IAdd %115, %116 : i32
                %118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %119 = spv.Load "StorageBuffer" %118 : f32
                %120 = spv.IAdd %117, %4 : i32
                %121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %122 = spv.Load "StorageBuffer" %121 : f32
                %123 = spv.IAdd %117, %2 : i32
                %124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %125 = spv.Load "StorageBuffer" %124 : f32
                %126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
                %127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
                %128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
                %129 = spv.FMul %128, %102 : vector<4xf32>
                %130 = spv.FAdd %129, %90 : vector<4xf32>
                %131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
                %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                %133 = spv.FMul %132, %106 : vector<4xf32>
                %134 = spv.FAdd %133, %130 : vector<4xf32>
                %135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
                %136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
                %137 = spv.FMul %136, %110 : vector<4xf32>
                %138 = spv.FAdd %137, %134 : vector<4xf32>
                %139 = spv.IAdd %74, %2 : i32
                %140 = spv.IAdd %37, %139 : i32
                %141 = spv.IAdd %26, %140 : i32
                %142 = spv.IMul %141, %11 : i32
                %143 = spv.IAdd %142, %116 : i32
                %144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %145 = spv.Load "StorageBuffer" %144 : f32
                %146 = spv.IAdd %143, %4 : i32
                %147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %148 = spv.Load "StorageBuffer" %147 : f32
                %149 = spv.IAdd %143, %2 : i32
                %150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %151 = spv.Load "StorageBuffer" %150 : f32
                %152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
                %153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
                %154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
                %155 = spv.FMul %154, %102 : vector<4xf32>
                %156 = spv.FAdd %155, %91 : vector<4xf32>
                %157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
                %158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
                %159 = spv.FMul %158, %106 : vector<4xf32>
                %160 = spv.FAdd %159, %156 : vector<4xf32>
                %161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
                %162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
                %163 = spv.FMul %162, %110 : vector<4xf32>
                %164 = spv.FAdd %163, %160 : vector<4xf32>
                %165 = spv.IAdd %74, %1 : i32
                %166 = spv.IAdd %37, %165 : i32
                %167 = spv.IAdd %26, %166 : i32
                %168 = spv.IMul %167, %11 : i32
                %169 = spv.IAdd %168, %116 : i32
                %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %171 = spv.Load "StorageBuffer" %170 : f32
                %172 = spv.IAdd %169, %4 : i32
                %173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %174 = spv.Load "StorageBuffer" %173 : f32
                %175 = spv.IAdd %169, %2 : i32
                %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %177 = spv.Load "StorageBuffer" %176 : f32
                %178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
                %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
                %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                %181 = spv.FMul %180, %102 : vector<4xf32>
                %182 = spv.FAdd %181, %92 : vector<4xf32>
                %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
                %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
                %185 = spv.FMul %184, %106 : vector<4xf32>
                %186 = spv.FAdd %185, %182 : vector<4xf32>
                %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
                %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
                %189 = spv.FMul %188, %110 : vector<4xf32>
                %190 = spv.FAdd %189, %186 : vector<4xf32>
                %191 = spv.IAdd %74, %3 : i32
                %192 = spv.IAdd %37, %191 : i32
                %193 = spv.IAdd %26, %192 : i32
                %194 = spv.IMul %193, %11 : i32
                %195 = spv.IAdd %194, %116 : i32
                %196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %197 = spv.Load "StorageBuffer" %196 : f32
                %198 = spv.IAdd %195, %4 : i32
                %199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %200 = spv.Load "StorageBuffer" %199 : f32
                %201 = spv.IAdd %195, %2 : i32
                %202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %203 = spv.Load "StorageBuffer" %202 : f32
                %204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
                %205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
                %206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
                %207 = spv.FMul %206, %102 : vector<4xf32>
                %208 = spv.FAdd %207, %93 : vector<4xf32>
                %209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
                %210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
                %211 = spv.FMul %210, %106 : vector<4xf32>
                %212 = spv.FAdd %211, %208 : vector<4xf32>
                %213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
                %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
                %215 = spv.FMul %214, %110 : vector<4xf32>
                %216 = spv.FAdd %215, %212 : vector<4xf32>
                spv.Store "Function" %80, %138 : vector<4xf32>
                spv.Store "Function" %81, %164 : vector<4xf32>
                spv.Store "Function" %82, %190 : vector<4xf32>
                spv.Store "Function" %83, %216 : vector<4xf32>
                %217 = spv.IAdd %89, %4 : i32
                spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              %84 = spv.Load "Function" %83 : vector<4xf32>
              %85 = spv.Load "Function" %82 : vector<4xf32>
              %86 = spv.Load "Function" %81 : vector<4xf32>
              %87 = spv.Load "Function" %80 : vector<4xf32>
              spv.Store "Function" %39, %87 : vector<4xf32>
              spv.Store "Function" %40, %86 : vector<4xf32>
              spv.Store "Function" %41, %85 : vector<4xf32>
              spv.Store "Function" %42, %84 : vector<4xf32>
              %88 = spv.IAdd %74, %4 : i32
              spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            %43 = spv.Load "Function" %42 : vector<4xf32>
            %44 = spv.Load "Function" %41 : vector<4xf32>
            %45 = spv.Load "Function" %40 : vector<4xf32>
            %46 = spv.Load "Function" %39 : vector<4xf32>
            %47 = spv.IAdd %35, %5 : i32
            %48 = spv.IAdd %23, %47 : i32
            %49 = spv.IAdd %24, %32 : i32
            %50 = spv.IAdd %25, %36 : i32
            %51 = spv.SDiv %50, %1 : i32
            %52 = spv.IMul %48, %12 : i32
            %53 = spv.IMul %49, %8 : i32
            %54 = spv.IAdd %52, %53 : i32
            %55 = spv.IAdd %54, %51 : i32
            %56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
            %57 = spv.IAdd %35, %2 : i32
            %58 = spv.IAdd %23, %57 : i32
            %59 = spv.IMul %58, %12 : i32
            %60 = spv.IAdd %59, %53 : i32
            %61 = spv.IAdd %60, %51 : i32
            %62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
            %63 = spv.IAdd %35, %4 : i32
            %64 = spv.IAdd %23, %63 : i32
            %65 = spv.IMul %64, %12 : i32
            %66 = spv.IAdd %65, %53 : i32
            %67 = spv.IAdd %66, %51 : i32
            %68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
            %69 = spv.IAdd %23, %35 : i32
            %70 = spv.IMul %69, %12 : i32
            %71 = spv.IAdd %70, %53 : i32
            %72 = spv.IAdd %71, %51 : i32
            %73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
            spv.Return
          }
          // Exposes @conv_dispatch_0 as a GLCompute entry point whose interface
          // lists the WorkgroupId and LocalInvocationId built-in variables.
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          // Local workgroup size: 4 x 4 x 1 invocations.
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
        }
        // Binding table for the dispatch: three storage buffers, all in set 0.
        hal.interface @legacy_io attributes {sym_visibility = "private"} {
          // Bindings 0 and 1 are declared with Read access.
          hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
          // Binding 2 is declared with Write|Discard access.
          hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  // Host-side entry point: allocates the result buffer, records a one-shot
  // command buffer that binds %arg0, %arg1 and the result buffer and dispatches
  // @conv_dispatch_0, then submits it, waits, and returns the result buffer.
  func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    // Size of the result buffer, computed from shape [1, 112, 112, 32] and the
    // element-type code 50331680 (presumably f32 -- the Canonicalizer dump
    // later in this file folds this size to 1605632 = 1*112*112*32*4).
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c112_0 = constant 112 : index
    %c32 = constant 32 : index
    %c50331680_i32 = constant 50331680 : i32
    %sz = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c112, %c112_0, %c32]) type(%c50331680_i32) : index
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%sz}
    // Start recording a one-shot command buffer.
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    // %c112_1 and %c32_2 are only forwarded to the hal.device.switch region below.
    %c112_1 = constant 112 : index
    %c32_2 = constant 32 : index
    // Layout with one descriptor set holding three storage buffers (R, R, DW).
    %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
    %c0 = constant 0 : index
    // Size of %arg0, computed from shape [1, 225, 225, 3].
    %c1_3 = constant 1 : index
    %c225 = constant 225 : index
    %c225_4 = constant 225 : index
    %c3 = constant 3 : index
    %allocator_5 = hal.buffer.allocator<%arg0 : !hal.buffer> : !hal.allocator
    %c50331680_i32_6 = constant 50331680 : i32
    %sz_7 = hal.allocator.compute_size<%allocator_5 : !hal.allocator> shape([%c1_3, %c225, %c225_4, %c3]) type(%c50331680_i32_6) : index
    // %c0_8 is the binding ordinal used for %arg0 below.
    %c0_8 = constant 0 : index
    // Size of %arg1, computed from shape [3, 3, 3, 32].
    %c3_9 = constant 3 : index
    %c3_10 = constant 3 : index
    %c3_11 = constant 3 : index
    %c32_12 = constant 32 : index
    %allocator_13 = hal.buffer.allocator<%arg1 : !hal.buffer> : !hal.allocator
    %c50331680_i32_14 = constant 50331680 : i32
    %sz_15 = hal.allocator.compute_size<%allocator_13 : !hal.allocator> shape([%c3_9, %c3_10, %c3_11, %c32_12]) type(%c50331680_i32_14) : index
    // %c1_16 is the binding ordinal used for %arg1 below.
    %c1_16 = constant 1 : index
    // Size of the result buffer again, recomputed from shape [1, 112, 112, 32].
    %c1_17 = constant 1 : index
    %c112_18 = constant 112 : index
    %c112_19 = constant 112 : index
    %c32_20 = constant 32 : index
    %allocator_21 = hal.buffer.allocator<%buffer : !hal.buffer> : !hal.allocator
    %c50331680_i32_22 = constant 50331680 : i32
    %sz_23 = hal.allocator.compute_size<%allocator_21 : !hal.allocator> shape([%c1_17, %c112_18, %c112_19, %c32_20]) type(%c50331680_i32_22) : index
    // %c2 is the binding ordinal for the result buffer; %c0_24 selects set 0.
    %c2 = constant 2 : index
    %c0_24 = constant 0 : index
    // Bind ordinal 0 -> %arg0, 1 -> %arg1, 2 -> %buffer. Each entry carries
    // [%c0, size] (presumably byte offset and byte length -- confirm against
    // the HAL op definition).
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0_24] bindings([
      %c0_8 = (%arg0 : !hal.buffer)[%c0, %sz_7],
      %c1_16 = (%arg1 : !hal.buffer)[%c0, %sz_15],
      %c2 = (%buffer : !hal.buffer)[%c0, %sz_23]
    ])
    // Only a "vulkan*" device case is present. The region receives 32, 112, 112
    // as %arg3..%arg5 but does not use them; the workgroup count is the
    // constant triple [2, 28, 28].
    hal.device.switch<%device : !hal.device>
    #hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32_2 : index, %arg4 = %c112_1 : index, %arg5 = %c112_1 : index) {
      %c2_25 = constant 2 : index
      %c28 = constant 28 : index
      %c28_26 = constant 28 : index
      hal.command_buffer.dispatch.symbol<%arg2 : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv::@conv_dispatch_0) workgroups([%c2_25, %c28, %c28_26])
      hal.return
    }
    // Barrier after the dispatch, then finish recording, submit and block.
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    return %buffer : !hal.buffer
  }
}


// *** IR Dump After mlir::iree_compiler::Shape::(anonymous namespace)::ExpandFunctionRankedShapeDimsPass ***
// Host-side entry point as printed after ExpandFunctionRankedShapeDimsPass:
// allocates the result buffer, records a one-shot command buffer that binds
// %arg0, %arg1 and the result buffer and dispatches @conv_dispatch_0, then
// submits it, waits, and returns the result buffer.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  // Result-buffer size from shape [1, 112, 112, 32] and element-type code
  // 50331680 (presumably f32 -- TODO confirm against the HAL element-type enum).
  %c1 = constant 1 : index
  %c112 = constant 112 : index
  %c112_0 = constant 112 : index
  %c32 = constant 32 : index
  %c50331680_i32 = constant 50331680 : i32
  %sz = hal.allocator.compute_size<%allocator : !hal.allocator> shape([%c1, %c112, %c112_0, %c32]) type(%c50331680_i32) : index
  %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%sz}
  // Start recording a one-shot command buffer.
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  // Forwarded to the hal.device.switch region below and not used elsewhere.
  %c112_1 = constant 112 : index
  %c32_2 = constant 32 : index
  // Layout with one descriptor set holding three storage buffers (R, R, DW).
  %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
  %c0 = constant 0 : index
  // Size of %arg0 from shape [1, 225, 225, 3].
  %c1_3 = constant 1 : index
  %c225 = constant 225 : index
  %c225_4 = constant 225 : index
  %c3 = constant 3 : index
  %allocator_5 = hal.buffer.allocator<%arg0 : !hal.buffer> : !hal.allocator
  %c50331680_i32_6 = constant 50331680 : i32
  %sz_7 = hal.allocator.compute_size<%allocator_5 : !hal.allocator> shape([%c1_3, %c225, %c225_4, %c3]) type(%c50331680_i32_6) : index
  // Binding ordinal for %arg0.
  %c0_8 = constant 0 : index
  // Size of %arg1 from shape [3, 3, 3, 32].
  %c3_9 = constant 3 : index
  %c3_10 = constant 3 : index
  %c3_11 = constant 3 : index
  %c32_12 = constant 32 : index
  %allocator_13 = hal.buffer.allocator<%arg1 : !hal.buffer> : !hal.allocator
  %c50331680_i32_14 = constant 50331680 : i32
  %sz_15 = hal.allocator.compute_size<%allocator_13 : !hal.allocator> shape([%c3_9, %c3_10, %c3_11, %c32_12]) type(%c50331680_i32_14) : index
  // Binding ordinal for %arg1.
  %c1_16 = constant 1 : index
  // Result-buffer size recomputed from shape [1, 112, 112, 32].
  %c1_17 = constant 1 : index
  %c112_18 = constant 112 : index
  %c112_19 = constant 112 : index
  %c32_20 = constant 32 : index
  %allocator_21 = hal.buffer.allocator<%buffer : !hal.buffer> : !hal.allocator
  %c50331680_i32_22 = constant 50331680 : i32
  %sz_23 = hal.allocator.compute_size<%allocator_21 : !hal.allocator> shape([%c1_17, %c112_18, %c112_19, %c32_20]) type(%c50331680_i32_22) : index
  // Binding ordinal for the result buffer, and the descriptor-set index (0).
  %c2 = constant 2 : index
  %c0_24 = constant 0 : index
  // Bind ordinal 0 -> %arg0, 1 -> %arg1, 2 -> %buffer, each with [%c0, size]
  // (presumably byte offset and byte length -- confirm against the HAL op).
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0_24] bindings([
    %c0_8 = (%arg0 : !hal.buffer)[%c0, %sz_7],
    %c1_16 = (%arg1 : !hal.buffer)[%c0, %sz_15],
    %c2 = (%buffer : !hal.buffer)[%c0, %sz_23]
  ])
  // Single "vulkan*" case; %arg3..%arg5 are captured but unused in the region.
  // The dispatch uses the constant workgroup count [2, 28, 28].
  hal.device.switch<%device : !hal.device>
    #hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32_2 : index, %arg4 = %c112_1 : index, %arg5 = %c112_1 : index) {
    %c2_25 = constant 2 : index
    %c28 = constant 28 : index
    %c28_26 = constant 28 : index
    hal.command_buffer.dispatch.symbol<%arg2 : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv::@conv_dispatch_0) workgroups([%c2_25, %c28, %c28_26])
    hal.return
  }
  // Barrier after the dispatch, then finish recording, submit and block.
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  return %buffer : !hal.buffer
}

// *** IR Dump After Canonicalizer ***
// Host-side entry point as printed after the Canonicalizer: the size
// computations of the previous dump appear here as plain constants and the
// duplicated index constants are gone.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {iree.module.export, iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  // Binding extents: 3456 = 3*3*3*32*4, 607500 = 1*225*225*3*4,
  // 1605632 = 1*112*112*32*4 (i.e. element count times 4).
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %c1605632 = constant 1605632 : index
  // %c0, %c1, %c2 double as binding ordinals; %c0 is also the set index and
  // the start of every bound range.
  %c1 = constant 1 : index
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  // Result buffer of 1605632 (see %c1605632 above).
  %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  // Start recording a one-shot command buffer.
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  // Layout with one descriptor set holding three storage buffers (R, R, DW).
  %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
  // Bind ordinal 0 -> %arg0, 1 -> %arg1, 2 -> %buffer in set %c0.
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
    %c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
    %c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
    %c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
  ])
  // Single "vulkan*" case; %arg3..%arg5 (32, 112, 112) are captured but not
  // used inside the region. The dispatch uses workgroup count [2, 28, 28].
  hal.device.switch<%device : !hal.device>
    #hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32 : index, %arg4 = %c112 : index, %arg5 = %c112 : index) {
    %c2_0 = constant 2 : index
    %c28 = constant 28 : index
    hal.command_buffer.dispatch.symbol<%arg2 : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv::@conv_dispatch_0) workgroups([%c2_0, %c28, %c28])
    hal.return
  }
  // Barrier after the dispatch, then finish recording, submit and block.
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  return %buffer : !hal.buffer
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::PublicABIGenerationPass ***
module  {
  hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
    // Binding table for the executable: three storage buffers, all in set 0.
    hal.interface @legacy_io {
      // Bindings 0 and 1 are declared with Read access.
      hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
      // Binding 2 is declared with Write|Discard access.
      hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.target @vulkan_spirv, filter="vulkan*" {
      // Entry-point record for @conv_dispatch_0 (ordinal 0, interface
      // @legacy_io). Its region ignores the three index arguments and returns
      // the constant triple (2, 28, 28) -- the same values the host code passes
      // as the workgroup count of the dispatch.
      hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c2 = constant 2 : index
        %c28 = constant 28 : index
        %c28_0 = constant 28 : index
        hal.return %c2, %c28, %c28_0 : index, index, index
      }
      module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          // Built-in inputs read by the shader.
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          // bind(0, 2): 100352 x vector<4xf32> (100352 = 1*112*112*32 / 4).
          spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
          // bind(0, 1): 216 x vector<4xf32> (216 = 3*3*3*32 / 4).
          spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
          // bind(0, 0): 151875 x f32 (151875 = 1*225*225*3); scalar element type.
          spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
          // Compute shader for the conv dispatch. Each invocation carries four
          // vector<4xf32> accumulators through a 3 x 3 loop nest, reading f32
          // scalars from the bind(0, 0) buffer and vector<4xf32> values from the
          // bind(0, 1) buffer, and finally stores the four accumulators to the
          // bind(0, 2) buffer.
          spv.func @conv_dispatch_0() "None" {
            // Initial value of every accumulator.
            %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            // Integer constants. Arithmetic notes: 675 = 225*3, 896 = 112*32/4,
            // 72 = 3*3*32/4, 24 = 3*32/4, 8 = 32/4 -- presumably the pitches of
            // the 1x225x225x3, 1x112x112x32 and 3x3x3x32 tensors (the latter two
            // counted in vector<4xf32> units).
            %1 = spv.Constant 4 : i32
            %2 = spv.Constant 2 : i32
            %3 = spv.Constant 6 : i32
            %4 = spv.Constant 1 : i32
            %5 = spv.Constant 3 : i32
            %6 = spv.Constant 0 : i32
            %7 = spv.Constant 16 : i32
            %8 = spv.Constant 8 : i32
            %9 = spv.Constant 72 : i32
            %10 = spv.Constant 24 : i32
            %11 = spv.Constant 675 : i32
            %12 = spv.Constant 896 : i32
            // %13 -> bind(0, 0) buffer, %14 -> bind(0, 1) buffer, %15 -> bind(0, 2) buffer.
            %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
            %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            // Workgroup id components: %18 = x, %20 = y, %22 = z (the built-in
            // is loaded once per component).
            %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %17 = spv.Load "Input" %16 : vector<3xi32>
            %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
            %19 = spv.Load "Input" %16 : vector<3xi32>
            %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
            %21 = spv.Load "Input" %16 : vector<3xi32>
            %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
            // Per-workgroup bases: %23 = 4*wg.z, %24 = 4*wg.y, %25 = 16*wg.x,
            // %26 = 8*wg.z, %27 = 8*wg.y.
            %23 = spv.IMul %22, %1 : i32
            %24 = spv.IMul %20, %1 : i32
            %25 = spv.IMul %18, %7 : i32
            %26 = spv.IMul %22, %8 : i32
            %27 = spv.IMul %20, %8 : i32
            // Local invocation id components: %30 = x, %32 = y, %34 = z.
            %28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %29 = spv.Load "Input" %28 : vector<3xi32>
            %30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
            %31 = spv.Load "Input" %28 : vector<3xi32>
            %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
            %33 = spv.Load "Input" %28 : vector<3xi32>
            %34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
            // Per-invocation offsets: %35 = 4*local.z, %36 = 4*local.x,
            // %37 = 8*local.z, %38 = 2*local.y.
            %35 = spv.IMul %34, %1 : i32
            %36 = spv.IMul %30, %1 : i32
            %37 = spv.IMul %34, %8 : i32
            %38 = spv.IMul %32, %2 : i32
            // Function-storage slots: written at the end of every outer-loop
            // iteration and read back after the loop.
            %39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            // Outer loop: %74 counts 0, 1, 2; %75..%78 are the loop-carried
            // accumulators, all starting at %0.
            spv.mlir.loop {
              spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
              %79 = spv.SLessThan %74, %5 : i32
              spv.BranchConditional %79, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Slots that hand the inner loop's accumulators back to this level.
              %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              // Inner loop: %89 counts 0, 1, 2; %90..%93 continue from %75..%78.
              spv.mlir.loop {
                spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %94 = spv.SLessThan %89, %5 : i32
                spv.BranchConditional %94, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Load three vector<4xf32> values (%102, %106, %110) from the
                // bind(0, 1) buffer at index %74*72 + %89*24 + (%25 + %36)/4,
                // plus 0, 8 and 16 respectively.
                %95 = spv.IAdd %25, %36 : i32
                %96 = spv.SDiv %95, %1 : i32
                %97 = spv.IMul %74, %9 : i32
                %98 = spv.IMul %89, %10 : i32
                %99 = spv.IAdd %97, %98 : i32
                %100 = spv.IAdd %99, %96 : i32
                %101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
                %103 = spv.IAdd %99, %8 : i32
                %104 = spv.IAdd %103, %96 : i32
                %105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
                %107 = spv.IAdd %99, %7 : i32
                %108 = spv.IAdd %107, %96 : i32
                %109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
                // Group 0: bind(0, 0) index = %113*675 + %116, where
                // %113 = %26 + %37 + %74 and %116 = (%27 + %38 + %89)*3.
                // Three consecutive f32 values (index +0, +1, +2) are loaded.
                %111 = spv.IAdd %37, %74 : i32
                %112 = spv.IAdd %38, %89 : i32
                %113 = spv.IAdd %26, %111 : i32
                %114 = spv.IAdd %27, %112 : i32
                %115 = spv.IMul %113, %11 : i32
                %116 = spv.IMul %114, %5 : i32
                %117 = spv.IAdd %115, %116 : i32
                %118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %119 = spv.Load "StorageBuffer" %118 : f32
                %120 = spv.IAdd %117, %4 : i32
                %121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %122 = spv.Load "StorageBuffer" %121 : f32
                %123 = spv.IAdd %117, %2 : i32
                %124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %125 = spv.Load "StorageBuffer" %124 : f32
                // Each scalar is replicated to four lanes, multiplied by
                // %102 / %106 / %110 in turn, and summed onto %90 -> %138.
                %126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
                %127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
                %128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
                %129 = spv.FMul %128, %102 : vector<4xf32>
                %130 = spv.FAdd %129, %90 : vector<4xf32>
                %131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
                %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                %133 = spv.FMul %132, %106 : vector<4xf32>
                %134 = spv.FAdd %133, %130 : vector<4xf32>
                %135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
                %136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
                %137 = spv.FMul %136, %110 : vector<4xf32>
                %138 = spv.FAdd %137, %134 : vector<4xf32>
                // Group 1: same pattern with %74 + 2 in place of %74; the
                // result is summed onto %91 -> %164.
                %139 = spv.IAdd %74, %2 : i32
                %140 = spv.IAdd %37, %139 : i32
                %141 = spv.IAdd %26, %140 : i32
                %142 = spv.IMul %141, %11 : i32
                %143 = spv.IAdd %142, %116 : i32
                %144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %145 = spv.Load "StorageBuffer" %144 : f32
                %146 = spv.IAdd %143, %4 : i32
                %147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %148 = spv.Load "StorageBuffer" %147 : f32
                %149 = spv.IAdd %143, %2 : i32
                %150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %151 = spv.Load "StorageBuffer" %150 : f32
                %152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
                %153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
                %154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
                %155 = spv.FMul %154, %102 : vector<4xf32>
                %156 = spv.FAdd %155, %91 : vector<4xf32>
                %157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
                %158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
                %159 = spv.FMul %158, %106 : vector<4xf32>
                %160 = spv.FAdd %159, %156 : vector<4xf32>
                %161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
                %162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
                %163 = spv.FMul %162, %110 : vector<4xf32>
                %164 = spv.FAdd %163, %160 : vector<4xf32>
                // Group 2: same pattern with %74 + 4; summed onto %92 -> %190.
                %165 = spv.IAdd %74, %1 : i32
                %166 = spv.IAdd %37, %165 : i32
                %167 = spv.IAdd %26, %166 : i32
                %168 = spv.IMul %167, %11 : i32
                %169 = spv.IAdd %168, %116 : i32
                %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %171 = spv.Load "StorageBuffer" %170 : f32
                %172 = spv.IAdd %169, %4 : i32
                %173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %174 = spv.Load "StorageBuffer" %173 : f32
                %175 = spv.IAdd %169, %2 : i32
                %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %177 = spv.Load "StorageBuffer" %176 : f32
                %178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
                %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
                %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                %181 = spv.FMul %180, %102 : vector<4xf32>
                %182 = spv.FAdd %181, %92 : vector<4xf32>
                %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
                %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
                %185 = spv.FMul %184, %106 : vector<4xf32>
                %186 = spv.FAdd %185, %182 : vector<4xf32>
                %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
                %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
                %189 = spv.FMul %188, %110 : vector<4xf32>
                %190 = spv.FAdd %189, %186 : vector<4xf32>
                // Group 3: same pattern with %74 + 6; summed onto %93 -> %216.
                %191 = spv.IAdd %74, %3 : i32
                %192 = spv.IAdd %37, %191 : i32
                %193 = spv.IAdd %26, %192 : i32
                %194 = spv.IMul %193, %11 : i32
                %195 = spv.IAdd %194, %116 : i32
                %196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %197 = spv.Load "StorageBuffer" %196 : f32
                %198 = spv.IAdd %195, %4 : i32
                %199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %200 = spv.Load "StorageBuffer" %199 : f32
                %201 = spv.IAdd %195, %2 : i32
                %202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %203 = spv.Load "StorageBuffer" %202 : f32
                %204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
                %205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
                %206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
                %207 = spv.FMul %206, %102 : vector<4xf32>
                %208 = spv.FAdd %207, %93 : vector<4xf32>
                %209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
                %210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
                %211 = spv.FMul %210, %106 : vector<4xf32>
                %212 = spv.FAdd %211, %208 : vector<4xf32>
                %213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
                %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
                %215 = spv.FMul %214, %110 : vector<4xf32>
                %216 = spv.FAdd %215, %212 : vector<4xf32>
                // Save the updated accumulators to %80..%83, advance %89 by 1
                // and branch back with the same four values.
                spv.Store "Function" %80, %138 : vector<4xf32>
                spv.Store "Function" %81, %164 : vector<4xf32>
                spv.Store "Function" %82, %190 : vector<4xf32>
                spv.Store "Function" %83, %216 : vector<4xf32>
                %217 = spv.IAdd %89, %4 : i32
                spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              // Read the inner loop's results back, copy them to %39..%42,
              // advance %74 by 1 and branch back with them.
              %84 = spv.Load "Function" %83 : vector<4xf32>
              %85 = spv.Load "Function" %82 : vector<4xf32>
              %86 = spv.Load "Function" %81 : vector<4xf32>
              %87 = spv.Load "Function" %80 : vector<4xf32>
              spv.Store "Function" %39, %87 : vector<4xf32>
              spv.Store "Function" %40, %86 : vector<4xf32>
              spv.Store "Function" %41, %85 : vector<4xf32>
              spv.Store "Function" %42, %84 : vector<4xf32>
              %88 = spv.IAdd %74, %4 : i32
              spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            // Final accumulators: %46 = group 0, %45 = group 1, %44 = group 2,
            // %43 = group 3.
            %43 = spv.Load "Function" %42 : vector<4xf32>
            %44 = spv.Load "Function" %41 : vector<4xf32>
            %45 = spv.Load "Function" %40 : vector<4xf32>
            %46 = spv.Load "Function" %39 : vector<4xf32>
            // Store to the bind(0, 2) buffer at index
            //   (%23 + %35 + k)*896 + (%24 + %32)*8 + (%25 + %36)/4
            // with k = 3 for %43, 2 for %44, 1 for %45 and 0 for %46.
            %47 = spv.IAdd %35, %5 : i32
            %48 = spv.IAdd %23, %47 : i32
            %49 = spv.IAdd %24, %32 : i32
            %50 = spv.IAdd %25, %36 : i32
            %51 = spv.SDiv %50, %1 : i32
            %52 = spv.IMul %48, %12 : i32
            %53 = spv.IMul %49, %8 : i32
            %54 = spv.IAdd %52, %53 : i32
            %55 = spv.IAdd %54, %51 : i32
            %56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
            // k = 2
            %57 = spv.IAdd %35, %2 : i32
            %58 = spv.IAdd %23, %57 : i32
            %59 = spv.IMul %58, %12 : i32
            %60 = spv.IAdd %59, %53 : i32
            %61 = spv.IAdd %60, %51 : i32
            %62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
            // k = 1
            %63 = spv.IAdd %35, %4 : i32
            %64 = spv.IAdd %23, %63 : i32
            %65 = spv.IMul %64, %12 : i32
            %66 = spv.IAdd %65, %53 : i32
            %67 = spv.IAdd %66, %51 : i32
            %68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
            // k = 0
            %69 = spv.IAdd %23, %35 : i32
            %70 = spv.IMul %69, %12 : i32
            %71 = spv.IAdd %70, %53 : i32
            %72 = spv.IAdd %71, %51 : i32
            %73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
            spv.Return
          }
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
        }
        // Binding table used by the dispatch: three storage buffers, all in
        // descriptor set 0. Bindings 0 and 1 are declared with access "Read",
        // binding 2 with access "Write|Discard".
        hal.interface @legacy_io attributes {sym_visibility = "private"} {
          hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  // Host-side driver for the convolution: allocates the result buffer, records
  // a one-shot command buffer containing a single dispatch of
  // @conv_dispatch_0, submits it, waits for completion and returns the result
  // buffer. %arg0 and %arg1 are bound to descriptor bindings 0 and 1.
  func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
    // Byte lengths used for the three buffer bindings below:
    //   3456    = 3*3*3*32 * 4      (binding 1)
    //   607500  = 225*225*3 * 4     (binding 0)
    //   1605632 = 112*112*32 * 4    (binding 2, the allocated result)
    %c3456 = constant 3456 : index
    %c607500 = constant 607500 : index
    %c1605632 = constant 1605632 : index
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    // Result buffer of 1605632 bytes; this is the value returned to the caller.
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    // Layout with one descriptor set holding three storage-buffer bindings
    // (0: R, 1: R, 2: DW).
    %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
    // Push set 0: each entry is binding = (buffer)[byte offset, byte length].
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
      %c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
      %c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
    ])
    // Only a "vulkan*" device case is present. At this stage the dispatch
    // still names its entry point by symbol. The region captures %c32 and
    // %c112 as %arg3..%arg5 but does not use them; the workgroup count is the
    // constant triple (2, 28, 28).
    hal.device.switch<%device : !hal.device>
    #hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32 : index, %arg4 = %c112 : index, %arg5 = %c112 : index) {
      %c2_0 = constant 2 : index
      %c28 = constant 28 : index
      hal.command_buffer.dispatch.symbol<%arg2 : !hal.command_buffer> target(@conv_dispatch_0::@vulkan_spirv::@conv_dispatch_0) workgroups([%c2_0, %c28, %c28])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    // Submission and wait happen here, before the buffer is returned.
    hal.ex.submit_and_wait %device, %cmd
    return %buffer : !hal.buffer
  }
  // Exported as "conv$async". Waits for semaphore %arg0 to reach %arg1, runs
  // @conv on the buffers backing the two input buffer views, wraps the result
  // in a buffer view of shape [1, 112, 112, 32], signals semaphore %arg4 with
  // value %arg5 and returns the view.
  func @conv$async(%arg0: !hal.semaphore, %arg1: index, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.semaphore, %arg5: index) -> !hal.buffer_view attributes {iree.module.export = "conv$async"} {
    // Wait on the input semaphore and check the returned status.
    %0 = hal.semaphore.await<%arg0 : !hal.semaphore> until(%arg1) : i32
    hal.check_success %0, "semaphore wait failed"
    %buffer = hal.buffer_view.buffer %arg2 : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer %arg3 : !hal.buffer
    %1 = call @conv(%buffer, %buffer_0) : (!hal.buffer, !hal.buffer) -> !hal.buffer
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c112_1 = constant 112 : index
    %c32 = constant 32 : index
    // 50331680 == 0x03000020. NOTE(review): presumably the HAL element-type
    // code for f32 -- confirm against the HAL element type definitions.
    %c50331680_i32 = constant 50331680 : i32
    %view = hal.buffer_view.create %1, element_type = %c50331680_i32, shape = [%c1, %c112, %c112_1, %c32] : !hal.buffer -> !hal.buffer_view
    hal.semaphore.signal<%arg4 : !hal.semaphore> value(%arg5)
    return %view : !hal.buffer_view
  }
  // Synchronous wrapper exported under the name "conv". Creates a semaphore
  // with initial value 0, calls @conv$async using that semaphore both as the
  // wait (value 0) and the signal (value 1), then waits for value 1 before
  // returning the resulting buffer view.
  func @conv$sync(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export = "conv", iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    %device = hal.ex.shared_device : !hal.device
    %semaphore = hal.semaphore.create device(%device : !hal.device) initial(%c0) : !hal.semaphore
    %0 = call @conv$async(%semaphore, %c0, %arg0, %arg1, %semaphore, %c1) : (!hal.semaphore, index, !hal.buffer_view, !hal.buffer_view, !hal.semaphore, index) -> !hal.buffer_view
    // Wait until the async call has signalled value 1, then check the status.
    %1 = hal.semaphore.await<%semaphore : !hal.semaphore> until(%c1) : i32
    hal.check_success %1, "semaphore wait failed"
    return %0 : !hal.buffer_view
  }
}


// *** IR Dump After mlir::iree_compiler::IREE::HAL::ResolveEntryPointOrdinalsPass ***
module  {
    // Device executable for the convolution dispatch. It contains the binding
    // interface, a single target matching "vulkan*" devices, that target's
    // entry point, and the SPIR-V module implementing the kernel.
    hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
      // Three storage buffers in descriptor set 0: bindings 0 and 1 have
      // access "Read", binding 2 has access "Write|Discard".
      hal.interface @legacy_io {
        hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
        hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
        hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
      }
      hal.executable.target @vulkan_spirv, filter="vulkan*" {
        // Entry point with ordinal 0. Its region returns the constant
        // workgroup count (2, 28, 28); the three index block arguments are
        // not used.
        hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
        ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
          %c2 = constant 2 : index
          %c28 = constant 28 : index
          hal.return %c2, %c28, %c28 : index, index, index
        }
        module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
          spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
            spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
            spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
            // Resource variables, all in descriptor set 0. The element counts
            // match the tensor shapes in the entry-point signature:
            //   binding 2: 100352 x vector<4xf32> = 1*112*112*32 f32 values
            //   binding 1:    216 x vector<4xf32> = 3*3*3*32 f32 values
            //   binding 0: 151875 x f32           = 1*225*225*3 f32 values
            spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
            // Kernel body. Each invocation accumulates four vector<4xf32>
            // results over a 3x3 pair of nested loops and stores them to
            // binding 2 at four indices that differ only in the 896-stride
            // term.
            spv.func @conv_dispatch_0() "None" {
              // Zero vector used as the initial value of every accumulator.
              %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
              // Integer constants reused below as multipliers, offsets and
              // loop bounds.
              %1 = spv.Constant 4 : i32
              %2 = spv.Constant 2 : i32
              %3 = spv.Constant 6 : i32
              %4 = spv.Constant 1 : i32
              %5 = spv.Constant 3 : i32
              %6 = spv.Constant 0 : i32
              %7 = spv.Constant 16 : i32
              %8 = spv.Constant 8 : i32
              %9 = spv.Constant 72 : i32
              %10 = spv.Constant 24 : i32
              %11 = spv.Constant 675 : i32
              %12 = spv.Constant 896 : i32
              // %13 -> binding 0, %14 -> binding 1, %15 -> binding 2.
              %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
              %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
              %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
              // Workgroup ID components: %18 = x, %20 = y, %22 = z.
              %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
              %17 = spv.Load "Input" %16 : vector<3xi32>
              %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
              %19 = spv.Load "Input" %16 : vector<3xi32>
              %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
              %21 = spv.Load "Input" %16 : vector<3xi32>
              %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
              // Per-workgroup offsets:
              //   %23 = wg.z*4, %24 = wg.y*4, %25 = wg.x*16  (used for binding 2; %25 also for binding 1)
              //   %26 = wg.z*8, %27 = wg.y*8                 (used for binding 0)
              %23 = spv.IMul %22, %1 : i32
              %24 = spv.IMul %20, %1 : i32
              %25 = spv.IMul %18, %7 : i32
              %26 = spv.IMul %22, %8 : i32
              %27 = spv.IMul %20, %8 : i32
              // Local invocation ID components: %30 = x, %32 = y, %34 = z.
              // The execution mode below sets LocalSize to 4, 4, 1, so the z
              // component can only be 0.
              %28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
              %29 = spv.Load "Input" %28 : vector<3xi32>
              %30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
              %31 = spv.Load "Input" %28 : vector<3xi32>
              %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
              %33 = spv.Load "Input" %28 : vector<3xi32>
              %34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
              // Per-invocation offsets:
              //   %35 = local.z*4, %36 = local.x*4, %37 = local.z*8, %38 = local.y*2
              %35 = spv.IMul %34, %1 : i32
              %36 = spv.IMul %30, %1 : i32
              %37 = spv.IMul %34, %8 : i32
              %38 = spv.IMul %32, %2 : i32
              // Function-scope slots through which the four accumulators
              // leave the outer loop.
              %39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              // Outer loop: %74 takes the values 0, 1, 2. %75..%78 are the
              // four loop-carried accumulators, all starting at zero.
              spv.mlir.loop {
                spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %79 = spv.SLessThan %74, %5 : i32
                spv.BranchConditional %79, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Slots through which the accumulators leave the inner loop.
                %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                %83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
                // Inner loop: %89 takes the values 0, 1, 2; %90..%93 continue
                // the accumulators from the outer iteration.
                spv.mlir.loop {
                  spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                  %94 = spv.SLessThan %89, %5 : i32
                  spv.BranchConditional %94, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Three vec4 loads from binding 1 (%102, %106, %110) at
                  // vec4 index  %74*72 + %89*24 + {0, 8, 16} + (wg.x*16 + local.x*4)/4.
                  // 72 = 3*3*32/4, 24 = 3*32/4 and 8 = 32/4, i.e. the strides of
                  // a 3x3x3x32 f32 array addressed in units of four floats.
                  %95 = spv.IAdd %25, %36 : i32
                  %96 = spv.SDiv %95, %1 : i32
                  %97 = spv.IMul %74, %9 : i32
                  %98 = spv.IMul %89, %10 : i32
                  %99 = spv.IAdd %97, %98 : i32
                  %100 = spv.IAdd %99, %96 : i32
                  %101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
                  %103 = spv.IAdd %99, %8 : i32
                  %104 = spv.IAdd %103, %96 : i32
                  %105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
                  %107 = spv.IAdd %99, %7 : i32
                  %108 = spv.IAdd %107, %96 : i32
                  %109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                  %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
                  // Accumulator 0 (%90 -> %138): three consecutive f32 loads
                  // from binding 0 starting at scalar index
                  //   (wg.z*8 + local.z*8 + %74)*675 + (wg.y*8 + local.y*2 + %89)*3
                  // where 675 = 225*3. Each scalar is replicated into a vec4,
                  // multiplied with %102 / %106 / %110 and added to the
                  // running sum.
                  %111 = spv.IAdd %37, %74 : i32
                  %112 = spv.IAdd %38, %89 : i32
                  %113 = spv.IAdd %26, %111 : i32
                  %114 = spv.IAdd %27, %112 : i32
                  %115 = spv.IMul %113, %11 : i32
                  %116 = spv.IMul %114, %5 : i32
                  %117 = spv.IAdd %115, %116 : i32
                  %118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %119 = spv.Load "StorageBuffer" %118 : f32
                  %120 = spv.IAdd %117, %4 : i32
                  %121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %122 = spv.Load "StorageBuffer" %121 : f32
                  %123 = spv.IAdd %117, %2 : i32
                  %124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %125 = spv.Load "StorageBuffer" %124 : f32
                  %126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
                  %127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
                  %128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
                  %129 = spv.FMul %128, %102 : vector<4xf32>
                  %130 = spv.FAdd %129, %90 : vector<4xf32>
                  %131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
                  %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                  %133 = spv.FMul %132, %106 : vector<4xf32>
                  %134 = spv.FAdd %133, %130 : vector<4xf32>
                  %135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
                  %136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
                  %137 = spv.FMul %136, %110 : vector<4xf32>
                  %138 = spv.FAdd %137, %134 : vector<4xf32>
                  // Accumulator 1 (%91 -> %164): same computation with %74 + 2
                  // in the 675-stride term.
                  %139 = spv.IAdd %74, %2 : i32
                  %140 = spv.IAdd %37, %139 : i32
                  %141 = spv.IAdd %26, %140 : i32
                  %142 = spv.IMul %141, %11 : i32
                  %143 = spv.IAdd %142, %116 : i32
                  %144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %145 = spv.Load "StorageBuffer" %144 : f32
                  %146 = spv.IAdd %143, %4 : i32
                  %147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %148 = spv.Load "StorageBuffer" %147 : f32
                  %149 = spv.IAdd %143, %2 : i32
                  %150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %151 = spv.Load "StorageBuffer" %150 : f32
                  %152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
                  %153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
                  %154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
                  %155 = spv.FMul %154, %102 : vector<4xf32>
                  %156 = spv.FAdd %155, %91 : vector<4xf32>
                  %157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
                  %158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
                  %159 = spv.FMul %158, %106 : vector<4xf32>
                  %160 = spv.FAdd %159, %156 : vector<4xf32>
                  %161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
                  %162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
                  %163 = spv.FMul %162, %110 : vector<4xf32>
                  %164 = spv.FAdd %163, %160 : vector<4xf32>
                  // Accumulator 2 (%92 -> %190): same computation with %74 + 4.
                  %165 = spv.IAdd %74, %1 : i32
                  %166 = spv.IAdd %37, %165 : i32
                  %167 = spv.IAdd %26, %166 : i32
                  %168 = spv.IMul %167, %11 : i32
                  %169 = spv.IAdd %168, %116 : i32
                  %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %171 = spv.Load "StorageBuffer" %170 : f32
                  %172 = spv.IAdd %169, %4 : i32
                  %173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %174 = spv.Load "StorageBuffer" %173 : f32
                  %175 = spv.IAdd %169, %2 : i32
                  %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %177 = spv.Load "StorageBuffer" %176 : f32
                  %178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
                  %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
                  %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                  %181 = spv.FMul %180, %102 : vector<4xf32>
                  %182 = spv.FAdd %181, %92 : vector<4xf32>
                  %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
                  %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
                  %185 = spv.FMul %184, %106 : vector<4xf32>
                  %186 = spv.FAdd %185, %182 : vector<4xf32>
                  %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
                  %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
                  %189 = spv.FMul %188, %110 : vector<4xf32>
                  %190 = spv.FAdd %189, %186 : vector<4xf32>
                  // Accumulator 3 (%93 -> %216): same computation with %74 + 6.
                  %191 = spv.IAdd %74, %3 : i32
                  %192 = spv.IAdd %37, %191 : i32
                  %193 = spv.IAdd %26, %192 : i32
                  %194 = spv.IMul %193, %11 : i32
                  %195 = spv.IAdd %194, %116 : i32
                  %196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %197 = spv.Load "StorageBuffer" %196 : f32
                  %198 = spv.IAdd %195, %4 : i32
                  %199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %200 = spv.Load "StorageBuffer" %199 : f32
                  %201 = spv.IAdd %195, %2 : i32
                  %202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                  %203 = spv.Load "StorageBuffer" %202 : f32
                  %204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
                  %205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
                  %206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
                  %207 = spv.FMul %206, %102 : vector<4xf32>
                  %208 = spv.FAdd %207, %93 : vector<4xf32>
                  %209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
                  %210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
                  %211 = spv.FMul %210, %106 : vector<4xf32>
                  %212 = spv.FAdd %211, %208 : vector<4xf32>
                  %213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
                  %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
                  %215 = spv.FMul %214, %110 : vector<4xf32>
                  %216 = spv.FAdd %215, %212 : vector<4xf32>
                  // Save the updated accumulators to the slots read after
                  // the loop, then advance %89 by 1.
                  spv.Store "Function" %80, %138 : vector<4xf32>
                  spv.Store "Function" %81, %164 : vector<4xf32>
                  spv.Store "Function" %82, %190 : vector<4xf32>
                  spv.Store "Function" %83, %216 : vector<4xf32>
                  %217 = spv.IAdd %89, %4 : i32
                  spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
                ^bb3:  // pred: ^bb1
                  spv.mlir.merge
                }
                // Read the inner-loop results back (%87 = accumulator 0 ...
                // %84 = accumulator 3), copy them to the outer slots and
                // advance %74 by 1.
                %84 = spv.Load "Function" %83 : vector<4xf32>
                %85 = spv.Load "Function" %82 : vector<4xf32>
                %86 = spv.Load "Function" %81 : vector<4xf32>
                %87 = spv.Load "Function" %80 : vector<4xf32>
                spv.Store "Function" %39, %87 : vector<4xf32>
                spv.Store "Function" %40, %86 : vector<4xf32>
                spv.Store "Function" %41, %85 : vector<4xf32>
                spv.Store "Function" %42, %84 : vector<4xf32>
                %88 = spv.IAdd %74, %4 : i32
                spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              // Final values: %46 = accumulator 0, %45 = accumulator 1,
              // %44 = accumulator 2, %43 = accumulator 3.
              %43 = spv.Load "Function" %42 : vector<4xf32>
              %44 = spv.Load "Function" %41 : vector<4xf32>
              %45 = spv.Load "Function" %40 : vector<4xf32>
              %46 = spv.Load "Function" %39 : vector<4xf32>
              // Stores to binding 2 at vec4 index
              //   (wg.z*4 + local.z*4 + k)*896 + (wg.y*4 + local.y)*8 + (wg.x*16 + local.x*4)/4
              // with 896 = 112*32/4 and 8 = 32/4. Accumulator 3 uses k = 3.
              %47 = spv.IAdd %35, %5 : i32
              %48 = spv.IAdd %23, %47 : i32
              %49 = spv.IAdd %24, %32 : i32
              %50 = spv.IAdd %25, %36 : i32
              %51 = spv.SDiv %50, %1 : i32
              %52 = spv.IMul %48, %12 : i32
              %53 = spv.IMul %49, %8 : i32
              %54 = spv.IAdd %52, %53 : i32
              %55 = spv.IAdd %54, %51 : i32
              %56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
              // Accumulator 2, k = 2.
              %57 = spv.IAdd %35, %2 : i32
              %58 = spv.IAdd %23, %57 : i32
              %59 = spv.IMul %58, %12 : i32
              %60 = spv.IAdd %59, %53 : i32
              %61 = spv.IAdd %60, %51 : i32
              %62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
              // Accumulator 1, k = 1.
              %63 = spv.IAdd %35, %4 : i32
              %64 = spv.IAdd %23, %63 : i32
              %65 = spv.IMul %64, %12 : i32
              %66 = spv.IAdd %65, %53 : i32
              %67 = spv.IAdd %66, %51 : i32
              %68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
              // Accumulator 0, k = 0.
              %69 = spv.IAdd %23, %35 : i32
              %70 = spv.IMul %69, %12 : i32
              %71 = spv.IAdd %70, %53 : i32
              %72 = spv.IAdd %71, %51 : i32
              %73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
              spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
              spv.Return
            }
            // Compute entry point with a 4 x 4 x 1 local workgroup size.
            spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
            spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
          }
          // Nested copy of the binding interface, listing the same three
          // bindings as the interface at the top of the executable.
          hal.interface @legacy_io attributes {sym_visibility = "private"} {
            hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
            hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
            hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
          }
        }
      }
    }
  // Host-side driver for the convolution: allocates the result buffer, records
  // a one-shot command buffer containing a single dispatch of the
  // @conv_dispatch_0 executable, submits it, waits for completion and returns
  // the result buffer. %arg0 and %arg1 are bound to descriptor bindings 0 and 1.
  func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
    // Byte lengths used for the three buffer bindings below:
    //   3456    = 3*3*3*32 * 4      (binding 1)
    //   607500  = 225*225*3 * 4     (binding 0)
    //   1605632 = 112*112*32 * 4    (binding 2, the allocated result)
    %c3456 = constant 3456 : index
    %c607500 = constant 607500 : index
    %c1605632 = constant 1605632 : index
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    // Result buffer of 1605632 bytes; this is the value returned to the caller.
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    // Layout with one descriptor set holding three storage-buffer bindings
    // (0: R, 1: R, 2: DW).
    %executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layouts([[#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]]) : !hal.executable_layout
    // Push set 0: each entry is binding = (buffer)[byte offset, byte length].
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([
      %c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
      %c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
    ])
    // Only a "vulkan*" device case is present. The dispatch now goes through
    // an executable looked up on the command buffer's device and selects the
    // entry point by ordinal [0] instead of by symbol. %arg3..%arg5 are
    // captured but unused; the workgroup count is the constant (2, 28, 28).
    hal.device.switch<%device : !hal.device>
    #hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32 : index, %arg4 = %c112 : index, %arg5 = %c112 : index) {
      %c2_0 = constant 2 : index
      %c28 = constant 28 : index
      %0 = hal.command_buffer.device<%arg2 : !hal.command_buffer> : !hal.device
      %exe = hal.executable.lookup device(%0 : !hal.device) executable(@conv_dispatch_0) : !hal.executable
      hal.command_buffer.dispatch<%arg2 : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c2_0, %c28, %c28])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    // Submission and wait happen here, before the buffer is returned.
    hal.ex.submit_and_wait %device, %cmd
    return %buffer : !hal.buffer
  }
  // Exported as "conv$async". Waits for semaphore %arg0 to reach %arg1, runs
  // @conv on the buffers backing the two input buffer views, wraps the result
  // in a buffer view of shape [1, 112, 112, 32], signals semaphore %arg4 with
  // value %arg5 and returns the view.
  func @conv$async(%arg0: !hal.semaphore, %arg1: index, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.semaphore, %arg5: index) -> !hal.buffer_view attributes {iree.module.export = "conv$async"} {
    // Shape constants for the result view; a single %c112 serves both of the
    // 112-sized dimensions.
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    // 50331680 == 0x03000020. NOTE(review): presumably the HAL element-type
    // code for f32 -- confirm against the HAL element type definitions.
    %c50331680_i32 = constant 50331680 : i32
    // Wait on the input semaphore and check the returned status.
    %0 = hal.semaphore.await<%arg0 : !hal.semaphore> until(%arg1) : i32
    hal.check_success %0, "semaphore wait failed"
    %buffer = hal.buffer_view.buffer %arg2 : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer %arg3 : !hal.buffer
    %1 = call @conv(%buffer, %buffer_0) : (!hal.buffer, !hal.buffer) -> !hal.buffer
    %view = hal.buffer_view.create %1, element_type = %c50331680_i32, shape = [%c1, %c112, %c112, %c32] : !hal.buffer -> !hal.buffer_view
    hal.semaphore.signal<%arg4 : !hal.semaphore> value(%arg5)
    return %view : !hal.buffer_view
  }
  // Synchronous wrapper exported under the name "conv". Creates a semaphore
  // with initial value 0, calls @conv$async using that semaphore both as the
  // wait (value 0) and the signal (value 1), then waits for value 1 before
  // returning the resulting buffer view.
  func @conv$sync(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export = "conv", iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    %device = hal.ex.shared_device : !hal.device
    %semaphore = hal.semaphore.create device(%device : !hal.device) initial(%c0) : !hal.semaphore
    %0 = call @conv$async(%semaphore, %c0, %arg0, %arg1, %semaphore, %c1) : (!hal.semaphore, index, !hal.buffer_view, !hal.buffer_view, !hal.semaphore, index) -> !hal.buffer_view
    // Wait until the async call has signalled value 1, then check the status.
    %1 = hal.semaphore.await<%semaphore : !hal.semaphore> until(%c1) : i32
    hal.check_success %1, "semaphore wait failed"
    return %0 : !hal.buffer_view
  }
}


// *** IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass ***
module  {
  // Private module-level variable holding a descriptor set layout; its value is
  // produced by the initializer function that follows.
  hal.variable @_descriptor_set_layout_0 init(@_descriptor_set_layout_0_initializer) : !hal.descriptor_set_layout attributes {sym_visibility = "private"}
  // Creates a PushOnly descriptor set layout on the shared device with three
  // "StorageBuffer" bindings: 0 and 1 tagged R, 2 tagged DW.
  // NOTE(review): R / DW presumably abbreviate Read and Discard|Write, matching
  // the access strings on the @legacy_io interface bindings - confirm.
  func private @_descriptor_set_layout_0_initializer() -> !hal.descriptor_set_layout {
    %device = hal.ex.shared_device : !hal.device
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
    return %descriptor_set_layout : !hal.descriptor_set_layout
  }
  // Private module-level variable holding the executable layout; its value is
  // produced by the initializer function that follows.
  hal.variable @_executable_layout_0 init(@_executable_layout_0_initializer) : !hal.executable_layout attributes {sym_visibility = "private"}
  // Builds the executable layout from the single descriptor set layout stored
  // in @_descriptor_set_layout_0, with zero push constants.
  func private @_executable_layout_0_initializer() -> !hal.executable_layout {
    // Reads the variable @_descriptor_set_layout_0 rather than creating a new layout.
    %0 = hal.variable.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    %device = hal.ex.shared_device : !hal.device
    %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%0]) : !hal.executable_layout
    return %executable_layout : !hal.executable_layout
  }
  // Private module-level variable holding the executable for @conv_dispatch_0;
  // its value is produced by the initializer function that follows.
  hal.variable @_executable_conv_dispatch_0 init(@_executable_conv_dispatch_0_initializer) : !hal.executable attributes {sym_visibility = "private"}
  // Selects how to create the executable based on the shared device:
  //   - device id matching "vulkan*": create it from the
  //     @conv_dispatch_0::@vulkan_spirv target using @_executable_layout_0;
  //   - otherwise (match.always): yield a null executable.
  func private @_executable_conv_dispatch_0_initializer() -> !hal.executable {
    %device = hal.ex.shared_device : !hal.device
    %0 = hal.device.switch<%device : !hal.device> -> !hal.executable
    #hal.device.match.id<"vulkan*">(%arg0 = %device : !hal.device) {
      %1 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
      %exe = hal.executable.create device(%arg0 : !hal.device) target(@conv_dispatch_0::@vulkan_spirv) layouts([%1]) : !hal.executable
      hal.return %exe : !hal.executable
    },
    #hal.match.always() {
      %1 = iree.null : !hal.executable
      hal.return %1 : !hal.executable
    }
    return %0 : !hal.executable
  }
  hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
    // I/O interface of the executable: three storage-buffer bindings in set 0.
    // Bindings 0 and 1 are read-only; binding 2 is written with discard.
    hal.interface @legacy_io {
      hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.target @vulkan_spirv, filter="vulkan*" {
      // Entry point 0 of the Vulkan/SPIR-V target. The attached region returns a
      // constant workgroup count of (2, 28, 28); its three index block arguments
      // are not used.
      // NOTE(review): the returned triple is presumably in (x, y, z) order. That
      // is consistent with the kernel scaling WorkgroupId x by 16 (2*16 = 32) and
      // y/z by 4 (28*4 = 112) for the 1x112x112x32 result - confirm.
      hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c2 = constant 2 : index
        %c28 = constant 28 : index
        hal.return %c2, %c28, %c28 : index, index, index
      }
      module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          // Built-in inputs read by the kernel: per-invocation local id and workgroup id.
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          // Storage-buffer resources (NOTE(review): bind(a, b) is presumably
          // (descriptor set, binding) - confirm). Element counts agree with the
          // entry-point signature:
          //   bind(0, 2): 100352 x vector<4xf32> = 112*112*32 / 4
          //   bind(0, 1):    216 x vector<4xf32> = 3*3*3*32 / 4
          //   bind(0, 0): 151875 x f32           = 225*225*3
          spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
          // Compute kernel for the convolution dispatch.
          //   %13: scalar f32 array        (@__resource_var_204067744__) - read
          //   %14: vector<4xf32> array     (@__resource_var_204089696__) - read
          //   %15: vector<4xf32> array     (@__resource_var_204090752__) - written
          // Each invocation carries four vector<4xf32> accumulators through a 3x3
          // loop nest (%74 outer, %89 inner). Every iteration loads three vec4
          // values from %14 and, for four row offsets (+0, +2, +4, +6), three
          // consecutive f32 values from %13, then does broadcast multiply-adds.
          // After the loops the four accumulators are stored to %15 at four
          // indices that differ only in the row term (+0, +1, +2, +3).
          spv.func @conv_dispatch_0() "None" {
            // Initial accumulator value (all zeros).
            %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            // Integer constants used in the index arithmetic below:
            //   %5  = 3   loop bound and column multiplier for %13
            //   %9  = 72, %10 = 24, %8 = 8, %7 = 16   offsets into %14
            //   %11 = 675 (= 225*3)    row multiplier for %13
            //   %12 = 896 (= 112*32/4) row multiplier for %15
            %1 = spv.Constant 4 : i32
            %2 = spv.Constant 2 : i32
            %3 = spv.Constant 6 : i32
            %4 = spv.Constant 1 : i32
            %5 = spv.Constant 3 : i32
            %6 = spv.Constant 0 : i32
            %7 = spv.Constant 16 : i32
            %8 = spv.Constant 8 : i32
            %9 = spv.Constant 72 : i32
            %10 = spv.Constant 24 : i32
            %11 = spv.Constant 675 : i32
            %12 = spv.Constant 896 : i32
            %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
            %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            // Workgroup id components: %18 = x, %20 = y, %22 = z.
            %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %17 = spv.Load "Input" %16 : vector<3xi32>
            %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
            %19 = spv.Load "Input" %16 : vector<3xi32>
            %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
            %21 = spv.Load "Input" %16 : vector<3xi32>
            %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
            // Per-workgroup base offsets:
            //   %23 = wg.z*4, %24 = wg.y*4, %25 = wg.x*16  (used for %15 / %14 indices)
            //   %26 = wg.z*8, %27 = wg.y*8                 (used for %13 indices)
            %23 = spv.IMul %22, %1 : i32
            %24 = spv.IMul %20, %1 : i32
            %25 = spv.IMul %18, %7 : i32
            %26 = spv.IMul %22, %8 : i32
            %27 = spv.IMul %20, %8 : i32
            // Local invocation id components: %30 = x, %32 = y, %34 = z.
            %28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %29 = spv.Load "Input" %28 : vector<3xi32>
            %30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
            %31 = spv.Load "Input" %28 : vector<3xi32>
            %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
            %33 = spv.Load "Input" %28 : vector<3xi32>
            %34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
            // Per-invocation offsets:
            //   %35 = lid.z*4, %36 = lid.x*4, %37 = lid.z*8, %38 = lid.y*2
            %35 = spv.IMul %34, %1 : i32
            %36 = spv.IMul %30, %1 : i32
            %37 = spv.IMul %34, %8 : i32
            %38 = spv.IMul %32, %2 : i32
            // Function-local slots that receive the four accumulators on every
            // outer-loop iteration; they are read back after the loop.
            %39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            // Outer loop: %74 runs 0, 1, 2 (while %74 < %5); %75..%78 are the
            // four accumulators carried as block arguments, starting at zero.
            spv.mlir.loop {
              spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
              %79 = spv.SLessThan %74, %5 : i32
              spv.BranchConditional %79, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Slots used to carry the inner loop's results out of its region.
              %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              // Inner loop: %89 runs 0, 1, 2; %90..%93 continue from %75..%78.
              spv.mlir.loop {
                spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %94 = spv.SLessThan %89, %5 : i32
                spv.BranchConditional %94, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Loads from %14: base %99 = %74*72 + %89*24, plus
                // %96 = (wg.x*16 + lid.x*4) / 4. Three vec4 values are read at
                // offsets +0 (%102), +8 (%106) and +16 (%110).
                %95 = spv.IAdd %25, %36 : i32
                %96 = spv.SDiv %95, %1 : i32
                %97 = spv.IMul %74, %9 : i32
                %98 = spv.IMul %89, %10 : i32
                %99 = spv.IAdd %97, %98 : i32
                %100 = spv.IAdd %99, %96 : i32
                %101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
                %103 = spv.IAdd %99, %8 : i32
                %104 = spv.IAdd %103, %96 : i32
                %105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
                %107 = spv.IAdd %99, %7 : i32
                %108 = spv.IAdd %107, %96 : i32
                %109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
                // Accumulator 1 (row offset +0). Index into %13:
                //   row %113 = wg.z*8 + lid.z*8 + %74
                //   col %114 = wg.y*8 + lid.y*2 + %89
                //   %117 = row*675 + col*3; elements %117, +1, +2 are loaded.
                // %116 (col*3) is reused by the three groups that follow.
                %111 = spv.IAdd %37, %74 : i32
                %112 = spv.IAdd %38, %89 : i32
                %113 = spv.IAdd %26, %111 : i32
                %114 = spv.IAdd %27, %112 : i32
                %115 = spv.IMul %113, %11 : i32
                %116 = spv.IMul %114, %5 : i32
                %117 = spv.IAdd %115, %116 : i32
                %118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %119 = spv.Load "StorageBuffer" %118 : f32
                %120 = spv.IAdd %117, %4 : i32
                %121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %122 = spv.Load "StorageBuffer" %121 : f32
                %123 = spv.IAdd %117, %2 : i32
                %124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %125 = spv.Load "StorageBuffer" %124 : f32
                // Broadcast each of the three scalars to vec4 and multiply-add
                // with %102 / %106 / %110 respectively, starting from %90.
                %126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
                %127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
                %128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
                %129 = spv.FMul %128, %102 : vector<4xf32>
                %130 = spv.FAdd %129, %90 : vector<4xf32>
                %131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
                %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                %133 = spv.FMul %132, %106 : vector<4xf32>
                %134 = spv.FAdd %133, %130 : vector<4xf32>
                %135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
                %136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
                %137 = spv.FMul %136, %110 : vector<4xf32>
                %138 = spv.FAdd %137, %134 : vector<4xf32>
                // Accumulator 2: same pattern with row offset %74 + 2, starting from %91.
                %139 = spv.IAdd %74, %2 : i32
                %140 = spv.IAdd %37, %139 : i32
                %141 = spv.IAdd %26, %140 : i32
                %142 = spv.IMul %141, %11 : i32
                %143 = spv.IAdd %142, %116 : i32
                %144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %145 = spv.Load "StorageBuffer" %144 : f32
                %146 = spv.IAdd %143, %4 : i32
                %147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %148 = spv.Load "StorageBuffer" %147 : f32
                %149 = spv.IAdd %143, %2 : i32
                %150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %151 = spv.Load "StorageBuffer" %150 : f32
                %152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
                %153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
                %154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
                %155 = spv.FMul %154, %102 : vector<4xf32>
                %156 = spv.FAdd %155, %91 : vector<4xf32>
                %157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
                %158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
                %159 = spv.FMul %158, %106 : vector<4xf32>
                %160 = spv.FAdd %159, %156 : vector<4xf32>
                %161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
                %162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
                %163 = spv.FMul %162, %110 : vector<4xf32>
                %164 = spv.FAdd %163, %160 : vector<4xf32>
                // Accumulator 3: same pattern with row offset %74 + 4, starting from %92.
                %165 = spv.IAdd %74, %1 : i32
                %166 = spv.IAdd %37, %165 : i32
                %167 = spv.IAdd %26, %166 : i32
                %168 = spv.IMul %167, %11 : i32
                %169 = spv.IAdd %168, %116 : i32
                %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %171 = spv.Load "StorageBuffer" %170 : f32
                %172 = spv.IAdd %169, %4 : i32
                %173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %174 = spv.Load "StorageBuffer" %173 : f32
                %175 = spv.IAdd %169, %2 : i32
                %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %177 = spv.Load "StorageBuffer" %176 : f32
                %178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
                %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
                %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                %181 = spv.FMul %180, %102 : vector<4xf32>
                %182 = spv.FAdd %181, %92 : vector<4xf32>
                %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
                %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
                %185 = spv.FMul %184, %106 : vector<4xf32>
                %186 = spv.FAdd %185, %182 : vector<4xf32>
                %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
                %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
                %189 = spv.FMul %188, %110 : vector<4xf32>
                %190 = spv.FAdd %189, %186 : vector<4xf32>
                // Accumulator 4: same pattern with row offset %74 + 6, starting from %93.
                %191 = spv.IAdd %74, %3 : i32
                %192 = spv.IAdd %37, %191 : i32
                %193 = spv.IAdd %26, %192 : i32
                %194 = spv.IMul %193, %11 : i32
                %195 = spv.IAdd %194, %116 : i32
                %196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %197 = spv.Load "StorageBuffer" %196 : f32
                %198 = spv.IAdd %195, %4 : i32
                %199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %200 = spv.Load "StorageBuffer" %199 : f32
                %201 = spv.IAdd %195, %2 : i32
                %202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %203 = spv.Load "StorageBuffer" %202 : f32
                %204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
                %205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
                %206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
                %207 = spv.FMul %206, %102 : vector<4xf32>
                %208 = spv.FAdd %207, %93 : vector<4xf32>
                %209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
                %210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
                %211 = spv.FMul %210, %106 : vector<4xf32>
                %212 = spv.FAdd %211, %208 : vector<4xf32>
                %213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
                %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
                %215 = spv.FMul %214, %110 : vector<4xf32>
                %216 = spv.FAdd %215, %212 : vector<4xf32>
                // Spill the updated accumulators so they can be read after the
                // inner loop, then continue with %89 + 1.
                spv.Store "Function" %80, %138 : vector<4xf32>
                spv.Store "Function" %81, %164 : vector<4xf32>
                spv.Store "Function" %82, %190 : vector<4xf32>
                spv.Store "Function" %83, %216 : vector<4xf32>
                %217 = spv.IAdd %89, %4 : i32
                spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              // Reload the inner loop's results (note the reversed order:
              // %87 <- %80 is accumulator 1, %84 <- %83 is accumulator 4),
              // copy them to the outer slots, and continue with %74 + 1.
              %84 = spv.Load "Function" %83 : vector<4xf32>
              %85 = spv.Load "Function" %82 : vector<4xf32>
              %86 = spv.Load "Function" %81 : vector<4xf32>
              %87 = spv.Load "Function" %80 : vector<4xf32>
              spv.Store "Function" %39, %87 : vector<4xf32>
              spv.Store "Function" %40, %86 : vector<4xf32>
              spv.Store "Function" %41, %85 : vector<4xf32>
              spv.Store "Function" %42, %84 : vector<4xf32>
              %88 = spv.IAdd %74, %4 : i32
              spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            // Final accumulators: %46 = accumulator 1 ... %43 = accumulator 4.
            %43 = spv.Load "Function" %42 : vector<4xf32>
            %44 = spv.Load "Function" %41 : vector<4xf32>
            %45 = spv.Load "Function" %40 : vector<4xf32>
            %46 = spv.Load "Function" %39 : vector<4xf32>
            // Stores to %15. Index = row*896 + col*8 + %51, where
            //   row  = wg.z*4 + lid.z*4 + {3, 2, 1, 0}
            //   col  = %49 = wg.y*4 + lid.y
            //   %51  = (wg.x*16 + lid.x*4) / 4
            // Accumulator 4 (%43) goes to row offset +3.
            %47 = spv.IAdd %35, %5 : i32
            %48 = spv.IAdd %23, %47 : i32
            %49 = spv.IAdd %24, %32 : i32
            %50 = spv.IAdd %25, %36 : i32
            %51 = spv.SDiv %50, %1 : i32
            %52 = spv.IMul %48, %12 : i32
            %53 = spv.IMul %49, %8 : i32
            %54 = spv.IAdd %52, %53 : i32
            %55 = spv.IAdd %54, %51 : i32
            %56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
            // Accumulator 3 (%44) goes to row offset +2.
            %57 = spv.IAdd %35, %2 : i32
            %58 = spv.IAdd %23, %57 : i32
            %59 = spv.IMul %58, %12 : i32
            %60 = spv.IAdd %59, %53 : i32
            %61 = spv.IAdd %60, %51 : i32
            %62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
            // Accumulator 2 (%45) goes to row offset +1.
            %63 = spv.IAdd %35, %4 : i32
            %64 = spv.IAdd %23, %63 : i32
            %65 = spv.IMul %64, %12 : i32
            %66 = spv.IAdd %65, %53 : i32
            %67 = spv.IAdd %66, %51 : i32
            %68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
            // Accumulator 1 (%46) goes to row offset +0.
            %69 = spv.IAdd %23, %35 : i32
            %70 = spv.IMul %69, %12 : i32
            %71 = spv.IAdd %70, %53 : i32
            %72 = spv.IAdd %71, %51 : i32
            %73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
            spv.Return
          }
          // Declares @conv_dispatch_0 as a GLCompute entry point that uses the two
          // built-in variables, with a local workgroup size of 4 x 4 x 1.
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
        }
        // Private copy of the @legacy_io interface nested inside the target's
        // module: three storage-buffer bindings in set 0 (two read, one
        // write|discard).
        hal.interface @legacy_io attributes {sym_visibility = "private"} {
          hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  // Host-side driver for the convolution (marked noinline). Allocates the
  // result buffer, records a one-shot command buffer that binds the two inputs
  // and the result and dispatches @_executable_conv_dispatch_0 entry point 0,
  // then submits and waits before returning the result buffer.
  func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
    // Binding lengths. NOTE(review): presumably byte sizes at 4 bytes per f32:
    //   3456    = 3*3*3*32*4      (%arg1)
    //   607500  = 225*225*3*4     (%arg0)
    //   1605632 = 112*112*32*4    (result)
    %c3456 = constant 3456 : index
    %c607500 = constant 607500 : index
    %c1605632 = constant 1605632 : index
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    // Result buffer, sized by %c1605632.
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %0 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
    // Bind ordinals 0/1/2 to %arg0, %arg1 and the result buffer, each from
    // offset %c0 with the lengths defined above.
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%0 : !hal.executable_layout)[%c0] bindings([
      %c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
      %c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
    ])
    // Only a "vulkan*" case exists. It captures %c32/%c112/%c112 as
    // %arg3..%arg5 but the region does not use them; the workgroup count is
    // the constant triple (2, 28, 28). %1 is also unused.
    hal.device.switch<%device : !hal.device>
    #hal.device.match.id<"vulkan*">(%arg2 = %cmd : !hal.command_buffer, %arg3 = %c32 : index, %arg4 = %c112 : index, %arg5 = %c112 : index) {
      %c2_0 = constant 2 : index
      %c28 = constant 28 : index
      %1 = hal.command_buffer.device<%arg2 : !hal.command_buffer> : !hal.device
      %2 = hal.variable.load @_executable_conv_dispatch_0 : !hal.executable
      hal.command_buffer.dispatch<%arg2 : !hal.command_buffer> target(%2 : !hal.executable)[0] workgroups([%c2_0, %c28, %c28])
      hal.return
    }
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    // Submit and wait, so the result buffer is returned only after this call.
    hal.ex.submit_and_wait %device, %cmd
    return %buffer : !hal.buffer
  }
  // Exported as "conv$async". Awaits semaphore %arg0 with until(%arg1), calls
  // @conv on the buffers behind the two input views, wraps the returned buffer
  // in a [1, 112, 112, 32] buffer view, and signals semaphore %arg4 with %arg5.
  func @conv$async(%arg0: !hal.semaphore, %arg1: index, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.semaphore, %arg5: index) -> !hal.buffer_view attributes {iree.module.export = "conv$async"} {
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    // 50331680 == 0x03000020. NOTE(review): presumably the HAL element-type
    // code for f32 - confirm against the HAL element type encoding.
    %c50331680_i32 = constant 50331680 : i32
    // Wait on the input semaphore; the i32 result is checked on the next line.
    %0 = hal.semaphore.await<%arg0 : !hal.semaphore> until(%arg1) : i32
    hal.check_success %0, "semaphore wait failed"
    // Unwrap the raw buffers from the two input buffer views.
    %buffer = hal.buffer_view.buffer %arg2 : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer %arg3 : !hal.buffer
    // @conv submits and waits internally before returning its result buffer.
    %1 = call @conv(%buffer, %buffer_0) : (!hal.buffer, !hal.buffer) -> !hal.buffer
    // Re-wrap the result buffer with the static result shape 1x112x112x32.
    %view = hal.buffer_view.create %1, element_type = %c50331680_i32, shape = [%c1, %c112, %c112, %c32] : !hal.buffer -> !hal.buffer_view
    hal.semaphore.signal<%arg4 : !hal.semaphore> value(%arg5)
    return %view : !hal.buffer_view
  }
  // Synchronous wrapper exported under the name "conv". Creates a fresh
  // semaphore with initial value 0 and passes it to @conv$async as both the
  // wait semaphore (wait value 0) and the signal semaphore (signal value 1),
  // then awaits value 1 before returning the resulting buffer view.
  func @conv$sync(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export = "conv", iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    %device = hal.ex.shared_device : !hal.device
    %semaphore = hal.semaphore.create device(%device : !hal.device) initial(%c0) : !hal.semaphore
    // Same semaphore on both sides: wait for 0, signal 1.
    %0 = call @conv$async(%semaphore, %c0, %arg0, %arg1, %semaphore, %c1) : (!hal.semaphore, index, !hal.buffer_view, !hal.buffer_view, !hal.semaphore, index) -> !hal.buffer_view
    // Block until the async entry point has signaled completion (value 1).
    %1 = hal.semaphore.await<%semaphore : !hal.semaphore> until(%c1) : i32
    hal.check_success %1, "semaphore wait failed"
    return %0 : !hal.buffer_view
  }
}


// *** IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass ***
// Initializer for the conv_dispatch_0 executable, with the device switch
// expressed as explicit control flow:
//   ^bb1: device id matches "vulkan*" -> create the executable from the
//         @conv_dispatch_0::@vulkan_spirv target.
//   ^bb2: fallback, guarded by a constant-true condition -> ^bb3 yields a null
//         executable; ^bb4 is the never-taken false edge and is unreachable.
//   ^bb5: join block returning whichever executable was produced.
func private @_executable_conv_dispatch_0_initializer() -> !hal.executable {
  %device = hal.ex.shared_device : !hal.device
  %0 = hal.device.match.id<%device : !hal.device> pattern("vulkan*") : i1
  cond_br %0, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  %1 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
  %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv) layouts([%1]) : !hal.executable
  br ^bb5(%exe : !hal.executable)
^bb2:  // pred: ^bb0
  // Always-true condition, so ^bb3 is always taken from here.
  %true = constant true
  cond_br %true, ^bb3, ^bb4
^bb3:  // pred: ^bb2
  %2 = iree.null : !hal.executable
  br ^bb5(%2 : !hal.executable)
^bb4:  // pred: ^bb2
  iree.unreachable
^bb5(%3: !hal.executable):  // 2 preds: ^bb1, ^bb3
  return %3 : !hal.executable
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass ***
// Host-side driver for the convolution, with the device switch expressed as
// explicit control flow. Allocates the result buffer, records a one-shot
// command buffer that binds the inputs and result, dispatches the executable
// when the device id matches "vulkan*" (otherwise hits iree.unreachable),
// then submits and waits before returning the result buffer.
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
  // Binding lengths. NOTE(review): presumably byte sizes at 4 bytes per f32:
  //   3456 = 3*3*3*32*4, 607500 = 225*225*3*4, 1605632 = 112*112*32*4.
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %c1605632 = constant 1605632 : index
  %c1 = constant 1 : index
  // %c112 and %c32 are not referenced anywhere else in this function.
  %c112 = constant 112 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  // Result buffer, sized by %c1605632.
  %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %0 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
  // Bind ordinals 0/1/2 to %arg0, %arg1 and the result buffer, each from
  // offset %c0 with the lengths defined above.
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%0 : !hal.executable_layout)[%c0] bindings([
    %c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
    %c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
    %c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
  ])
  %1 = hal.device.match.id<%device : !hal.device> pattern("vulkan*") : i1
  cond_br %1, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  // Dispatch entry point 0 with workgroup count (2, 28, 28). %2 is unused.
  %c2_0 = constant 2 : index
  %c28 = constant 28 : index
  %2 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
  %3 = hal.variable.load @_executable_conv_dispatch_0 : !hal.executable
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%3 : !hal.executable)[0] workgroups([%c2_0, %c28, %c28])
  br ^bb3
^bb2:  // pred: ^bb0
  // No non-Vulkan path exists for this dispatch.
  iree.unreachable
^bb3:  // pred: ^bb1
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  // Submit and wait, so the result buffer is returned only after this call.
  hal.ex.submit_and_wait %device, %cmd
  return %buffer : !hal.buffer
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass ***
module  {
  // Module-level i1 holding the result of matching the shared device against
  // "vulkan*". It is filled in by the initializer function below, so users can
  // load the variable instead of repeating the hal.device.match.id query.
  hal.variable @_device_match_id_0 init(@_device_match_id_0_initializer) : i1 attributes {sym_visibility = "private"}
  // Initializer: runs the device-id match once and returns the boolean.
  func private @_device_match_id_0_initializer() -> i1 {
    %device = hal.ex.shared_device : !hal.device
    %0 = hal.device.match.id<%device : !hal.device> pattern("vulkan*") : i1
    return %0 : i1
  }
  // Module-level descriptor set layout, created by the initializer below.
  hal.variable @_descriptor_set_layout_0 init(@_descriptor_set_layout_0_initializer) : !hal.descriptor_set_layout attributes {sym_visibility = "private"}
  // Initializer: builds a PushOnly layout with three StorageBuffer bindings.
  // Bindings 0 and 1 carry the `R` access flag and binding 2 carries `DW`,
  // mirroring the Read / Read / Write|Discard accesses declared on the
  // @legacy_io interface of @conv_dispatch_0.
  func private @_descriptor_set_layout_0_initializer() -> !hal.descriptor_set_layout {
    %device = hal.ex.shared_device : !hal.device
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(PushOnly) bindings([#hal.descriptor_set_layout_binding<0, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<1, "StorageBuffer", R>, #hal.descriptor_set_layout_binding<2, "StorageBuffer", DW>]) : !hal.descriptor_set_layout
    return %descriptor_set_layout : !hal.descriptor_set_layout
  }
  // Module-level executable layout, created by the initializer below.
  hal.variable @_executable_layout_0 init(@_executable_layout_0_initializer) : !hal.executable_layout attributes {sym_visibility = "private"}
  // Initializer: wraps the single descriptor set layout loaded from
  // @_descriptor_set_layout_0 into an executable layout with zero push constants.
  func private @_executable_layout_0_initializer() -> !hal.executable_layout {
    %0 = hal.variable.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    %device = hal.ex.shared_device : !hal.device
    %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%0]) : !hal.executable_layout
    return %executable_layout : !hal.executable_layout
  }
  // Module-level executable handle for the convolution dispatch, created by the
  // initializer below.
  hal.variable @_executable_conv_dispatch_0 init(@_executable_conv_dispatch_0_initializer) : !hal.executable attributes {sym_visibility = "private"}
  // Initializer: returns the executable created from the @vulkan_spirv target
  // when @_device_match_id_0 is true, and a null executable otherwise.
  func private @_executable_conv_dispatch_0_initializer() -> !hal.executable {
    %device = hal.ex.shared_device : !hal.device
    %0 = hal.variable.load @_device_match_id_0 : i1
    cond_br %0, ^bb1, ^bb2
  ^bb1:  // pred: ^bb0
    // Device matched: create the executable with the shared executable layout.
    %1 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
    %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv) layouts([%1]) : !hal.executable
    br ^bb5(%exe : !hal.executable)
  ^bb2:  // pred: ^bb0
    // Fallback arm. The condition is the constant `true`, so ^bb3 is always
    // taken and ^bb4 is never reached from here.
    %true = constant true
    cond_br %true, ^bb3, ^bb4
  ^bb3:  // pred: ^bb2
    %2 = iree.null : !hal.executable
    br ^bb5(%2 : !hal.executable)
  ^bb4:  // pred: ^bb2
    iree.unreachable
  ^bb5(%3: !hal.executable):  // 2 preds: ^bb1, ^bb3
    return %3 : !hal.executable
  }
  // Device executable for the convolution dispatch. It declares the three-buffer
  // binding interface and carries one target, @vulkan_spirv, whose body is a
  // SPIR-V compute module.
  hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
    // Two read-only storage buffers and one write/discard storage buffer, all in set 0.
    hal.interface @legacy_io {
      hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
      hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
      hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    }
    hal.executable.target @vulkan_spirv, filter="vulkan*" {
      // Entry point 0. The region returns the constant triple (2, 28, 28); its
      // three index block arguments are not used.
      hal.executable.entry_point @conv_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : index, signature = (!flow.dispatch.tensor<readonly:1x225x225x3xf32>, !flow.dispatch.tensor<readonly:3x3x3x32xf32>, !flow.dispatch.tensor<writeonly:1x112x112x32xf32>) -> ()} {
      ^bb0(%arg0: index, %arg1: index, %arg2: index):  // no predecessors
        %c2 = constant 2 : index
        %c28 = constant 28 : index
        hal.return %c2, %c28, %c28 : index, index, index
      }
      module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader, Float16, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, ARM:IntegratedGPU, {cooperative_matrix_properties_nv = [], max_compute_shared_memory_size = 32768 : i32, max_compute_workgroup_invocations = 512 : i32, max_compute_workgroup_size = dense<512> : vector<3xi32>, subgroup_size = 16 : i32}>}  {
        spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
          spv.GlobalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
          spv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
          // Resource variables for bind(0, 2), bind(0, 1) and bind(0, 0).
          // Element counts match the signature shapes: 100352 vec4 = 1*112*112*32 f32,
          // 216 vec4 = 3*3*3*32 f32, 151875 f32 = 1*225*225*3.
          // Note the output and filter are viewed as vector<4xf32>, the input as scalar f32.
          spv.GlobalVariable @__resource_var_204090752__ bind(0, 2) : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_204089696__ bind(0, 1) : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
          spv.GlobalVariable @__resource_var_204067744__ bind(0, 0) : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
          // Compute kernel. Each invocation accumulates four vector<4xf32> values in a
          // 3 x 3 loop nest (three multiply-adds per accumulator per iteration) and
          // stores them to four locations of the bind(0, 2) buffer.
          spv.func @conv_dispatch_0() "None" {
            // Zero vector used to initialise the four accumulators.
            %0 = spv.Constant dense<0.000000e+00> : vector<4xf32>
            %1 = spv.Constant 4 : i32
            %2 = spv.Constant 2 : i32
            %3 = spv.Constant 6 : i32
            %4 = spv.Constant 1 : i32
            %5 = spv.Constant 3 : i32
            %6 = spv.Constant 0 : i32
            %7 = spv.Constant 16 : i32
            %8 = spv.Constant 8 : i32
            // Linearisation strides. In vec4 units of the filter: 72 = 3*3*32/4, 24 = 3*32/4
            // (and 8 = 32/4 above). In f32 units of the input: 675 = 225*3. In vec4 units of
            // the output: 896 = 112*32/4. Presumably row-major strides of the signature
            // shapes — confirm against the lowering.
            %9 = spv.Constant 72 : i32
            %10 = spv.Constant 24 : i32
            %11 = spv.Constant 675 : i32
            %12 = spv.Constant 896 : i32
            // %13 = input (binding 0), %14 = filter (binding 1), %15 = output (binding 2).
            %13 = spv.mlir.addressof @__resource_var_204067744__ : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>
            %14 = spv.mlir.addressof @__resource_var_204089696__ : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            %15 = spv.mlir.addressof @__resource_var_204090752__ : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>
            // Workgroup id components: %18 = x, %20 = y, %22 = z (the builtin is loaded once per component).
            %16 = spv.mlir.addressof @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
            %17 = spv.Load "Input" %16 : vector<3xi32>
            %18 = spv.CompositeExtract %17[0 : i32] : vector<3xi32>
            %19 = spv.Load "Input" %16 : vector<3xi32>
            %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
            %21 = spv.Load "Input" %16 : vector<3xi32>
            %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
            // Per-workgroup offsets: z*4 and y*4 feed the output index, x*16 feeds the
            // channel index, z*8 and y*8 feed the input index.
            %23 = spv.IMul %22, %1 : i32
            %24 = spv.IMul %20, %1 : i32
            %25 = spv.IMul %18, %7 : i32
            %26 = spv.IMul %22, %8 : i32
            %27 = spv.IMul %20, %8 : i32
            // Local invocation id components: %30 = x, %32 = y, %34 = z.
            %28 = spv.mlir.addressof @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %29 = spv.Load "Input" %28 : vector<3xi32>
            %30 = spv.CompositeExtract %29[0 : i32] : vector<3xi32>
            %31 = spv.Load "Input" %28 : vector<3xi32>
            %32 = spv.CompositeExtract %31[1 : i32] : vector<3xi32>
            %33 = spv.Load "Input" %28 : vector<3xi32>
            %34 = spv.CompositeExtract %33[2 : i32] : vector<3xi32>
            // Per-invocation offsets: z*4 (output), x*4 (channel), z*8 and y*2 (input).
            %35 = spv.IMul %34, %1 : i32
            %36 = spv.IMul %30, %1 : i32
            %37 = spv.IMul %34, %8 : i32
            %38 = spv.IMul %32, %2 : i32
            // Function-scope slots that receive the four accumulators at the end of each
            // outer iteration; they are read back after the outer loop.
            %39 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %40 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %41 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            %42 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
            // Outer loop: %74 runs 0, 1, 2 and carries the four accumulators %75..%78.
            spv.mlir.loop {
              spv.Branch ^bb1(%6, %0, %0, %0, %0 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb1(%74: i32, %75: vector<4xf32>, %76: vector<4xf32>, %77: vector<4xf32>, %78: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
              %79 = spv.SLessThan %74, %5 : i32
              spv.BranchConditional %79, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Slots that receive the inner loop's accumulators on every inner iteration.
              %80 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %81 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %82 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              %83 = spv.Variable : !spv.ptr<vector<4xf32>, Function>
              // Inner loop: %89 runs 0, 1, 2 and carries the accumulators %90..%93.
              spv.mlir.loop {
                spv.Branch ^bb1(%6, %75, %76, %77, %78 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb1(%89: i32, %90: vector<4xf32>, %91: vector<4xf32>, %92: vector<4xf32>, %93: vector<4xf32>):  // 2 preds: ^bb0, ^bb2
                %94 = spv.SLessThan %89, %5 : i32
                spv.BranchConditional %94, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Channel vec4 index: (workgroup x * 16 + local x * 4) / 4. Both operands
                // are defined outside the loops, so this value does not change per iteration.
                %95 = spv.IAdd %25, %36 : i32
                %96 = spv.SDiv %95, %1 : i32
                // Filter loads: base %99 = %74*72 + %89*24, then offsets +0, +8, +16, each
                // plus the channel vec4 index. Results are %102, %106, %110.
                %97 = spv.IMul %74, %9 : i32
                %98 = spv.IMul %89, %10 : i32
                %99 = spv.IAdd %97, %98 : i32
                %100 = spv.IAdd %99, %96 : i32
                %101 = spv.AccessChain %14[%6, %100] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %102 = spv.Load "StorageBuffer" %101 : vector<4xf32>
                %103 = spv.IAdd %99, %8 : i32
                %104 = spv.IAdd %103, %96 : i32
                %105 = spv.AccessChain %14[%6, %104] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %106 = spv.Load "StorageBuffer" %105 : vector<4xf32>
                %107 = spv.IAdd %99, %7 : i32
                %108 = spv.IAdd %107, %96 : i32
                %109 = spv.AccessChain %14[%6, %108] : !spv.ptr<!spv.struct<(!spv.array<216 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
                %110 = spv.Load "StorageBuffer" %109 : vector<4xf32>
                // Input loads for accumulator 0. Index = (%26 + %37 + %74) * 675 + %116 + {0, 1, 2},
                // where %116 = (%27 + %38 + %89) * 3 is reused by all four accumulators.
                %111 = spv.IAdd %37, %74 : i32
                %112 = spv.IAdd %38, %89 : i32
                %113 = spv.IAdd %26, %111 : i32
                %114 = spv.IAdd %27, %112 : i32
                %115 = spv.IMul %113, %11 : i32
                %116 = spv.IMul %114, %5 : i32
                %117 = spv.IAdd %115, %116 : i32
                %118 = spv.AccessChain %13[%6, %117] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %119 = spv.Load "StorageBuffer" %118 : f32
                %120 = spv.IAdd %117, %4 : i32
                %121 = spv.AccessChain %13[%6, %120] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %122 = spv.Load "StorageBuffer" %121 : f32
                %123 = spv.IAdd %117, %2 : i32
                %124 = spv.AccessChain %13[%6, %123] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %125 = spv.Load "StorageBuffer" %124 : f32
                // Each of the three scalars is splatted to vec4, multiplied by the matching
                // filter vec4, and added onto the running sum that starts at %90.
                %126 = spv.CompositeConstruct %119, %122, %125 : vector<3xf32>
                %127 = spv.CompositeExtract %126[0 : i32] : vector<3xf32>
                %128 = spv.CompositeConstruct %127, %127, %127, %127 : vector<4xf32>
                %129 = spv.FMul %128, %102 : vector<4xf32>
                %130 = spv.FAdd %129, %90 : vector<4xf32>
                %131 = spv.CompositeExtract %126[1 : i32] : vector<3xf32>
                %132 = spv.CompositeConstruct %131, %131, %131, %131 : vector<4xf32>
                %133 = spv.FMul %132, %106 : vector<4xf32>
                %134 = spv.FAdd %133, %130 : vector<4xf32>
                %135 = spv.CompositeExtract %126[2 : i32] : vector<3xf32>
                %136 = spv.CompositeConstruct %135, %135, %135, %135 : vector<4xf32>
                %137 = spv.FMul %136, %110 : vector<4xf32>
                %138 = spv.FAdd %137, %134 : vector<4xf32>
                // Accumulator 1: same pattern with the row term using %74 + 2; running sum starts at %91.
                %139 = spv.IAdd %74, %2 : i32
                %140 = spv.IAdd %37, %139 : i32
                %141 = spv.IAdd %26, %140 : i32
                %142 = spv.IMul %141, %11 : i32
                %143 = spv.IAdd %142, %116 : i32
                %144 = spv.AccessChain %13[%6, %143] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %145 = spv.Load "StorageBuffer" %144 : f32
                %146 = spv.IAdd %143, %4 : i32
                %147 = spv.AccessChain %13[%6, %146] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %148 = spv.Load "StorageBuffer" %147 : f32
                %149 = spv.IAdd %143, %2 : i32
                %150 = spv.AccessChain %13[%6, %149] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %151 = spv.Load "StorageBuffer" %150 : f32
                %152 = spv.CompositeConstruct %145, %148, %151 : vector<3xf32>
                %153 = spv.CompositeExtract %152[0 : i32] : vector<3xf32>
                %154 = spv.CompositeConstruct %153, %153, %153, %153 : vector<4xf32>
                %155 = spv.FMul %154, %102 : vector<4xf32>
                %156 = spv.FAdd %155, %91 : vector<4xf32>
                %157 = spv.CompositeExtract %152[1 : i32] : vector<3xf32>
                %158 = spv.CompositeConstruct %157, %157, %157, %157 : vector<4xf32>
                %159 = spv.FMul %158, %106 : vector<4xf32>
                %160 = spv.FAdd %159, %156 : vector<4xf32>
                %161 = spv.CompositeExtract %152[2 : i32] : vector<3xf32>
                %162 = spv.CompositeConstruct %161, %161, %161, %161 : vector<4xf32>
                %163 = spv.FMul %162, %110 : vector<4xf32>
                %164 = spv.FAdd %163, %160 : vector<4xf32>
                // Accumulator 2: row term uses %74 + 4; running sum starts at %92.
                %165 = spv.IAdd %74, %1 : i32
                %166 = spv.IAdd %37, %165 : i32
                %167 = spv.IAdd %26, %166 : i32
                %168 = spv.IMul %167, %11 : i32
                %169 = spv.IAdd %168, %116 : i32
                %170 = spv.AccessChain %13[%6, %169] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %171 = spv.Load "StorageBuffer" %170 : f32
                %172 = spv.IAdd %169, %4 : i32
                %173 = spv.AccessChain %13[%6, %172] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %174 = spv.Load "StorageBuffer" %173 : f32
                %175 = spv.IAdd %169, %2 : i32
                %176 = spv.AccessChain %13[%6, %175] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %177 = spv.Load "StorageBuffer" %176 : f32
                %178 = spv.CompositeConstruct %171, %174, %177 : vector<3xf32>
                %179 = spv.CompositeExtract %178[0 : i32] : vector<3xf32>
                %180 = spv.CompositeConstruct %179, %179, %179, %179 : vector<4xf32>
                %181 = spv.FMul %180, %102 : vector<4xf32>
                %182 = spv.FAdd %181, %92 : vector<4xf32>
                %183 = spv.CompositeExtract %178[1 : i32] : vector<3xf32>
                %184 = spv.CompositeConstruct %183, %183, %183, %183 : vector<4xf32>
                %185 = spv.FMul %184, %106 : vector<4xf32>
                %186 = spv.FAdd %185, %182 : vector<4xf32>
                %187 = spv.CompositeExtract %178[2 : i32] : vector<3xf32>
                %188 = spv.CompositeConstruct %187, %187, %187, %187 : vector<4xf32>
                %189 = spv.FMul %188, %110 : vector<4xf32>
                %190 = spv.FAdd %189, %186 : vector<4xf32>
                // Accumulator 3: row term uses %74 + 6; running sum starts at %93.
                %191 = spv.IAdd %74, %3 : i32
                %192 = spv.IAdd %37, %191 : i32
                %193 = spv.IAdd %26, %192 : i32
                %194 = spv.IMul %193, %11 : i32
                %195 = spv.IAdd %194, %116 : i32
                %196 = spv.AccessChain %13[%6, %195] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %197 = spv.Load "StorageBuffer" %196 : f32
                %198 = spv.IAdd %195, %4 : i32
                %199 = spv.AccessChain %13[%6, %198] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %200 = spv.Load "StorageBuffer" %199 : f32
                %201 = spv.IAdd %195, %2 : i32
                %202 = spv.AccessChain %13[%6, %201] : !spv.ptr<!spv.struct<(!spv.array<151875 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
                %203 = spv.Load "StorageBuffer" %202 : f32
                %204 = spv.CompositeConstruct %197, %200, %203 : vector<3xf32>
                %205 = spv.CompositeExtract %204[0 : i32] : vector<3xf32>
                %206 = spv.CompositeConstruct %205, %205, %205, %205 : vector<4xf32>
                %207 = spv.FMul %206, %102 : vector<4xf32>
                %208 = spv.FAdd %207, %93 : vector<4xf32>
                %209 = spv.CompositeExtract %204[1 : i32] : vector<3xf32>
                %210 = spv.CompositeConstruct %209, %209, %209, %209 : vector<4xf32>
                %211 = spv.FMul %210, %106 : vector<4xf32>
                %212 = spv.FAdd %211, %208 : vector<4xf32>
                %213 = spv.CompositeExtract %204[2 : i32] : vector<3xf32>
                %214 = spv.CompositeConstruct %213, %213, %213, %213 : vector<4xf32>
                %215 = spv.FMul %214, %110 : vector<4xf32>
                %216 = spv.FAdd %215, %212 : vector<4xf32>
                // Publish the updated accumulators to the slots and also pass them to the
                // next inner iteration as block arguments.
                spv.Store "Function" %80, %138 : vector<4xf32>
                spv.Store "Function" %81, %164 : vector<4xf32>
                spv.Store "Function" %82, %190 : vector<4xf32>
                spv.Store "Function" %83, %216 : vector<4xf32>
                %217 = spv.IAdd %89, %4 : i32
                spv.Branch ^bb1(%217, %138, %164, %190, %216 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
              ^bb3:  // pred: ^bb1
                spv.mlir.merge
              }
              // Read the inner loop's results back from the slots, forward them to the
              // outer slots, and continue the outer loop with them.
              %84 = spv.Load "Function" %83 : vector<4xf32>
              %85 = spv.Load "Function" %82 : vector<4xf32>
              %86 = spv.Load "Function" %81 : vector<4xf32>
              %87 = spv.Load "Function" %80 : vector<4xf32>
              spv.Store "Function" %39, %87 : vector<4xf32>
              spv.Store "Function" %40, %86 : vector<4xf32>
              spv.Store "Function" %41, %85 : vector<4xf32>
              spv.Store "Function" %42, %84 : vector<4xf32>
              %88 = spv.IAdd %74, %4 : i32
              spv.Branch ^bb1(%88, %87, %86, %85, %84 : i32, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>)
            ^bb3:  // pred: ^bb1
              spv.mlir.merge
            }
            // Final accumulators: %46 = accumulator 0 ... %43 = accumulator 3.
            %43 = spv.Load "Function" %42 : vector<4xf32>
            %44 = spv.Load "Function" %41 : vector<4xf32>
            %45 = spv.Load "Function" %40 : vector<4xf32>
            %46 = spv.Load "Function" %39 : vector<4xf32>
            // Output stores. Index = (%23 + %35 + r) * 896 + (%24 + %32) * 8 + channel vec4
            // index, for r = 3, 2, 1, 0 in that order; accumulator r goes to row offset r.
            %47 = spv.IAdd %35, %5 : i32
            %48 = spv.IAdd %23, %47 : i32
            %49 = spv.IAdd %24, %32 : i32
            %50 = spv.IAdd %25, %36 : i32
            %51 = spv.SDiv %50, %1 : i32
            %52 = spv.IMul %48, %12 : i32
            %53 = spv.IMul %49, %8 : i32
            %54 = spv.IAdd %52, %53 : i32
            %55 = spv.IAdd %54, %51 : i32
            %56 = spv.AccessChain %15[%6, %55] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %56, %43 : vector<4xf32>
            %57 = spv.IAdd %35, %2 : i32
            %58 = spv.IAdd %23, %57 : i32
            %59 = spv.IMul %58, %12 : i32
            %60 = spv.IAdd %59, %53 : i32
            %61 = spv.IAdd %60, %51 : i32
            %62 = spv.AccessChain %15[%6, %61] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %62, %44 : vector<4xf32>
            %63 = spv.IAdd %35, %4 : i32
            %64 = spv.IAdd %23, %63 : i32
            %65 = spv.IMul %64, %12 : i32
            %66 = spv.IAdd %65, %53 : i32
            %67 = spv.IAdd %66, %51 : i32
            %68 = spv.AccessChain %15[%6, %67] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %68, %45 : vector<4xf32>
            %69 = spv.IAdd %23, %35 : i32
            %70 = spv.IMul %69, %12 : i32
            %71 = spv.IAdd %70, %53 : i32
            %72 = spv.IAdd %71, %51 : i32
            %73 = spv.AccessChain %15[%6, %72] : !spv.ptr<!spv.struct<(!spv.array<100352 x vector<4xf32>, stride=16> [0])>, StorageBuffer>, i32, i32
            spv.Store "StorageBuffer" %73, %46 : vector<4xf32>
            spv.Return
          }
          // GLCompute entry point with a 4 x 4 x 1 local size.
          spv.EntryPoint "GLCompute" @conv_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
          spv.ExecutionMode @conv_dispatch_0 "LocalSize", 4, 4, 1
        }
        // Private copy of the binding interface nested inside the target module.
        hal.interface @legacy_io attributes {sym_visibility = "private"} {
          hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  // Host-side driver for the convolution. Allocates the result buffer, records a
  // one-shot command buffer containing a single dispatch, submits it with
  // hal.ex.submit_and_wait, and returns the result buffer. In this dump the
  // device check is a load of @_device_match_id_0 rather than an inline query.
  //   %arg0: bound at binding 0 with the 607500 constant (input).
  //   %arg1: bound at binding 1 with the 3456 constant (filter).
  //   returns: the freshly allocated buffer bound at binding 2.
  func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
    // Sizes used for the bindings below; they equal the f32 byte sizes of the
    // tensors in the dispatch signature: 3*3*3*32*4 = 3456, 1*225*225*3*4 = 607500,
    // 1*112*112*32*4 = 1605632.
    %c3456 = constant 3456 : index
    %c607500 = constant 607500 : index
    %c1605632 = constant 1605632 : index
    // %c0/%c1/%c2 double as binding ordinals in push_descriptor_set.
    %c1 = constant 1 : index
    // NOTE(review): %c112 and %c32 are not referenced by any op in this function.
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    // Result buffer, sized with %c1605632.
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    %0 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
    // Bind input, filter and result to ordinals 0, 1, 2 of set %c0.
    // The bracketed pair is presumably (byte offset, byte length) — confirm against the HAL op definition.
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%0 : !hal.executable_layout)[%c0] bindings([
      %c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
      %c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
      %c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
    ])
    // Memoized "vulkan*" device match.
    %1 = hal.variable.load @_device_match_id_0 : i1
    cond_br %1, ^bb1, ^bb2
  ^bb1:  // pred: ^bb0
    // Matching device: dispatch entry point 0 with a 2 x 28 x 28 workgroup count,
    // the same triple the executable's entry_point region returns.
    %c2_0 = constant 2 : index
    %c28 = constant 28 : index
    // NOTE(review): %2 has no uses in this function.
    %2 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
    %3 = hal.variable.load @_executable_conv_dispatch_0 : !hal.executable
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%3 : !hal.executable)[0] workgroups([%c2_0, %c28, %c28])
    br ^bb3
  ^bb2:  // pred: ^bb0
    // No other device pattern is handled; this path is marked unreachable.
    iree.unreachable
  ^bb3:  // pred: ^bb1
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    return %buffer : !hal.buffer
  }
  // Exported wrapper "conv$async" around @conv that operates on buffer views and
  // is bracketed by semaphore operations.
  //   %arg0, %arg1: semaphore and value awaited before any work is done.
  //   %arg2, %arg3: buffer views whose backing buffers are passed to @conv.
  //   %arg4, %arg5: semaphore and value signalled after the result view is created.
  //   returns: a buffer view of shape [1, 112, 112, 32] over @conv's result buffer.
  func @conv$async(%arg0: !hal.semaphore, %arg1: index, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.semaphore, %arg5: index) -> !hal.buffer_view attributes {iree.module.export = "conv$async"} {
    %c1 = constant 1 : index
    %c112 = constant 112 : index
    %c32 = constant 32 : index
    // Element-type code passed to hal.buffer_view.create (0x03000020).
    // Presumably the HAL encoding for f32 — confirm against the HAL element-type enum.
    %c50331680_i32 = constant 50331680 : i32
    // Wait on the input semaphore and fail with the message below if the wait did not succeed.
    %0 = hal.semaphore.await<%arg0 : !hal.semaphore> until(%arg1) : i32
    hal.check_success %0, "semaphore wait failed"
    %buffer = hal.buffer_view.buffer %arg2 : !hal.buffer
    %buffer_0 = hal.buffer_view.buffer %arg3 : !hal.buffer
    %1 = call @conv(%buffer, %buffer_0) : (!hal.buffer, !hal.buffer) -> !hal.buffer
    %view = hal.buffer_view.create %1, element_type = %c50331680_i32, shape = [%c1, %c112, %c112, %c32] : !hal.buffer -> !hal.buffer_view
    hal.semaphore.signal<%arg4 : !hal.semaphore> value(%arg5)
    return %view : !hal.buffer_view
  }
  // Exported under the name "conv": a synchronous wrapper around @conv$async.
  // It creates one semaphore with initial value 0 and passes it as both the wait
  // semaphore (value 0) and the signal semaphore (value 1), then awaits value 1
  // before returning the resulting buffer view.
  func @conv$sync(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export = "conv", iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    %device = hal.ex.shared_device : !hal.device
    %semaphore = hal.semaphore.create device(%device : !hal.device) initial(%c0) : !hal.semaphore
    %0 = call @conv$async(%semaphore, %c0, %arg0, %arg1, %semaphore, %c1) : (!hal.semaphore, index, !hal.buffer_view, !hal.buffer_view, !hal.semaphore, index) -> !hal.buffer_view
    // Block until the signal value passed to @conv$async has been reached.
    %1 = hal.semaphore.await<%semaphore : !hal.semaphore> until(%c1) : i32
    hal.check_success %1, "semaphore wait failed"
    return %0 : !hal.buffer_view
  }
}


// *** IR Dump After Canonicalizer ***
// Initializer for the convolution executable handle. Returns the executable
// created from @conv_dispatch_0::@vulkan_spirv when @_device_match_id_0 is true,
// and a null executable otherwise. The false arm branches directly to the null
// value; there is no constant-true conditional or unreachable block here.
func private @_executable_conv_dispatch_0_initializer() -> !hal.executable {
  %device = hal.ex.shared_device : !hal.device
  %0 = hal.variable.load @_device_match_id_0 : i1
  cond_br %0, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  // Device matched: create the executable with the shared executable layout.
  %1 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
  %exe = hal.executable.create device(%device : !hal.device) target(@conv_dispatch_0::@vulkan_spirv) layouts([%1]) : !hal.executable
  br ^bb3(%exe : !hal.executable)
^bb2:  // pred: ^bb0
  // Device did not match: hand back a null executable.
  %2 = iree.null : !hal.executable
  br ^bb3(%2 : !hal.executable)
^bb3(%3: !hal.executable):  // 2 preds: ^bb1, ^bb2
  return %3 : !hal.executable
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::SerializeExecutablesPass ***
hal.executable @conv_dispatch_0 attributes {sym_visibility = "private"} {
  hal.interface @legacy_io {
    hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.binary @vulkan_spirv attributes {data = dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A0000
00150000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A0000
00F50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C00000074000000770000
007A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA0000
003D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D0004
0009000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>, format = 1397773893 : i32} {
  }
}

// *** IR Dump After Canonicalizer ***
// Host-side entry point for the convolution, expressed in the HAL dialect.
// It allocates a result buffer, records a one-shot command buffer that binds
// the two inputs plus the result and dispatches @_executable_conv_dispatch_0,
// submits it, waits, and returns the result buffer.
//   %arg0, %arg1: input buffers (bound at bindings 0 and 1 below).
//   returns:      the buffer allocated here (bound at binding 2).
func @conv(%arg0: !hal.buffer, %arg1: !hal.buffer) -> !hal.buffer attributes {noinline} {
  // Byte lengths of the three bound ranges. They equal 4 bytes times the
  // element counts of the tensor types in the original signature:
  //   3456    = 3*3*3*32*4      (tensor<3x3x3x32xf32>)
  //   607500  = 1*225*225*3*4   (tensor<1x225x225x3xf32>)
  //   1605632 = 1*112*112*32*4  (tensor<1x112x112x32xf32>)
  %c3456 = constant 3456 : index
  %c607500 = constant 607500 : index
  %c1605632 = constant 1605632 : index
  %c1 = constant 1 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %c28 = constant 28 : index
  %device = hal.ex.shared_device : !hal.device
  %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
  // Result buffer, sized to the output byte length above.
  %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("Transfer|Mapping|Dispatch") : !hal.buffer{%c1605632}
  %cmd = hal.command_buffer.create device(%device : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
  hal.command_buffer.begin<%cmd : !hal.command_buffer>
  %0 = hal.variable.load @_executable_layout_0 : !hal.executable_layout
  // Descriptor set 0: binding index = (buffer)[%c0, byte length].
  // NOTE(review): the bracketed pair is presumably [offset, length] - confirm
  // against the HAL dialect definition.
  hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%0 : !hal.executable_layout)[%c0] bindings([
    %c0 = (%arg0 : !hal.buffer)[%c0, %c607500],
    %c1 = (%arg1 : !hal.buffer)[%c0, %c3456],
    %c2 = (%buffer : !hal.buffer)[%c0, %c1605632]
  ])
  // Note that the descriptor set is pushed before the device-match branch, so
  // the command buffer has already been begun when ^bb2 is taken.
  %1 = hal.variable.load @_device_match_id_0 : i1
  cond_br %1, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  %2 = hal.variable.load @_executable_conv_dispatch_0 : !hal.executable
  // Dispatch entry point 0 with a workgroup count of [2, 28, 28].
  hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%2 : !hal.executable)[0] workgroups([%c2, %c28, %c28])
  hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
  hal.command_buffer.end<%cmd : !hal.command_buffer>
  hal.ex.submit_and_wait %device, %cmd
  return %buffer : !hal.buffer
^bb2:  // pred: ^bb0
  // Taken when @_device_match_id_0 is false; no dispatch is recorded.
  iree.unreachable
}

// *** IR Dump After mlir::iree_compiler::IREE::VM::ConversionPass ***
module  {
  vm.module @module {
    // Global holding the i32 result of the device-ID match; initialised by
    // @_device_match_id_0_initializer.
    vm.global.i32 @_device_match_id_0 init(@_device_match_id_0_initializer) : i32
    // The seven bytes below are the ASCII string "vulkan*".
    vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
    // Fetches the shared device and returns the i32 result of matching it
    // against the "vulkan*" pattern via hal.device.match.id.
    vm.func private @_device_match_id_0_initializer() -> i32 {
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
      %0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
      vm.return %0 : i32
    }
    // Global holding the descriptor set layout; initialised by
    // @_descriptor_set_layout_0_initializer.
    vm.global.ref @_descriptor_set_layout_0 init(@_descriptor_set_layout_0_initializer) : !vm.ref<!hal.descriptor_set_layout>
    // Creates a descriptor set layout on the shared device with usage_type 1
    // and three binding tuples: (0, 7, 1), (1, 7, 1) and (2, 7, 6).
    // NOTE(review): the tuple fields are presumably (binding index, descriptor
    // type, access flags) - the import only declares tuple<i32, i32, i32>, so
    // confirm against the HAL module definition.
    vm.func private @_descriptor_set_layout_0_initializer() -> !vm.ref<!hal.descriptor_set_layout> {
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      // Each tuple operand gets its own constant, so the values 1 and 7 are
      // materialised several times in this snapshot.
      %c1 = vm.const.i32 1 : i32
      %zero = vm.const.i32.zero : i32
      %c7 = vm.const.i32 7 : i32
      %c1_0 = vm.const.i32 1 : i32
      %c1_1 = vm.const.i32 1 : i32
      %c7_2 = vm.const.i32 7 : i32
      %c1_3 = vm.const.i32 1 : i32
      %c2 = vm.const.i32 2 : i32
      %c7_4 = vm.const.i32 7 : i32
      %c6 = vm.const.i32 6 : i32
      %ref_5 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7, %c1_0), (%c1_1, %c7_2, %c1_3), (%c2, %c7_4, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
      vm.return %ref_5 : !vm.ref<!hal.descriptor_set_layout>
    }
    // Global holding the executable layout; initialised by
    // @_executable_layout_0_initializer.
    vm.global.ref @_executable_layout_0 init(@_executable_layout_0_initializer) : !vm.ref<!hal.executable_layout>
    // Builds the executable layout from the single descriptor set layout held
    // in @_descriptor_set_layout_0, passing 0 as the push_constants argument.
    // It reads @_descriptor_set_layout_0, so that global must already hold its
    // value when this runs.
    vm.func private @_executable_layout_0_initializer() -> !vm.ref<!hal.executable_layout> {
      %_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %zero = vm.const.i32.zero : i32
      %ref_0 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
      vm.return %ref_0 : !vm.ref<!hal.executable_layout>
    }
    vm.global.ref @_executable_conv_dispatch_0 init(@_executable_conv_dispatch_0_initializer) : !vm.ref<!hal.executable>
    vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A000
000150000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000
000F50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C0000007400000077000
0007A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA000
0003D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D000
40009000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
    // Produces the value for @_executable_conv_dispatch_0.
    // Branches on @_device_match_id_0: on ^bb1 it creates the executable from
    // the embedded SPIR-V rodata, on ^bb2 it yields a null reference.
    vm.func private @_executable_conv_dispatch_0_initializer() -> !vm.ref<!hal.executable> {
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
      vm.cond_br %_device_match_id_0, ^bb1, ^bb2
    ^bb1:  // pred: ^bb0
      %_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      // executable_format argument: 1397773893 == 0x53505645, i.e. the ASCII
      // bytes "SPVE".
      %c1397773893 = vm.const.i32 1397773893 : i32
      %_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
      %ref_0 = vm.call.variadic @hal.executable.create(%ref, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
      vm.br ^bb3(%ref_0 : !vm.ref<!hal.executable>)
    ^bb2:  // pred: ^bb0
      // No executable is created on this path; a null reference is returned.
      %null = vm.const.ref.zero : !vm.ref<!hal.executable>
      vm.br ^bb3(%null : !vm.ref<!hal.executable>)
    ^bb3(%0: !vm.ref<!hal.executable>):  // 2 preds: ^bb1, ^bb2
      vm.return %0 : !vm.ref<!hal.executable>
    }
    // VM-dialect form of the host entry point. It allocates the result buffer,
    // records a command buffer that binds %arg0, %arg1 and the result,
    // dispatches @_executable_conv_dispatch_0, submits and waits, and returns
    // the result buffer. HAL ops appear here as calls to imported hal.*
    // functions with their enum operands lowered to plain i32 constants.
    vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
      // Byte lengths of the three bound buffer ranges:
      //   3456 = 3*3*3*32*4, 607500 = 225*225*3*4, 1605632 = 112*112*32*4.
      %c3456 = vm.const.i32 3456 : i32
      %c607500 = vm.const.i32 607500 : i32
      %c1605632 = vm.const.i32 1605632 : i32
      %c1 = vm.const.i32 1 : i32
      %zero = vm.const.i32.zero : i32
      %c2 = vm.const.i32 2 : i32
      %c28 = vm.const.i32 28 : i32
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      // memory_types = 50 and buffer_usage = 14. In the HAL-level dump of this
      // function the same operands were
      // type("HostVisible|DeviceVisible|DeviceLocal") and
      // usage("Transfer|Mapping|Dispatch").
      %c50 = vm.const.i32 50 : i32
      %c14 = vm.const.i32 14 : i32
      %ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
      // modes = 1 and command_categories = 3 (mode(OneShot) and
      // categories("Transfer|Dispatch") in the HAL-level dump).
      %c1_2 = vm.const.i32 1 : i32
      %c3 = vm.const.i32 3 : i32
      %ref_3 = vm.call @hal.command_buffer.create(%ref, %c1_2, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
      vm.call @hal.command_buffer.begin(%ref_3) : (!vm.ref<!hal.command_buffer>) -> ()
      %_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      // Set 0 with three binding tuples.
      // NOTE(review): tuple fields are presumably (binding, buffer, offset,
      // length) - confirm against the HAL module definition.
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_3, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
      %_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
      vm.cond_br %_device_match_id_0, ^bb1, ^bb2
    ^bb1:  // pred: ^bb0
      %_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
      // entry_point = 0; workgroup_x/y/z = 2, 28, 28.
      %zero_4 = vm.const.i32.zero : i32
      vm.call @hal.command_buffer.dispatch(%ref_3, %_executable_conv_dispatch_0, %zero_4, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
      // source_stage_mask = 20, target_stage_mask = 5, flags = 0
      // (source("Dispatch|CommandRetire"), target("CommandIssue|Dispatch"),
      // flags("None") in the HAL-level dump).
      %c20 = vm.const.i32 20 : i32
      %c5 = vm.const.i32 5 : i32
      %zero_5 = vm.const.i32.zero : i32
      vm.call @hal.command_buffer.execution_barrier(%ref_3, %c20, %c5, %zero_5) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.end(%ref_3) : (!vm.ref<!hal.command_buffer>) -> ()
      vm.call @hal.ex.submit_and_wait(%ref, %ref_3) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
      vm.return %ref_1 : !vm.ref<!hal.buffer>
    ^bb2:  // pred: ^bb0
      // Lowered form of iree.unreachable: fail with status code 2.
      %c2_6 = vm.const.i32 2 : i32
      vm.fail %c2_6, "unreachable location reached"
    }
    // Semaphore-driven wrapper around @conv operating on buffer views.
    //   %arg0, %arg1: semaphore and value awaited before any work is done.
    //   %arg2, %arg3: input buffer views; their backing buffers go to @conv.
    //   %arg4, %arg5: semaphore and value signalled after the result view
    //                 has been created.
    //   returns:      a buffer view over @conv's result with shape
    //                 [1, 112, 112, 32].
    vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
      %c1 = vm.const.i32 1 : i32
      %c112 = vm.const.i32 112 : i32
      %c32 = vm.const.i32 32 : i32
      // element_type argument; 50331680 == 0x03000020.
      // NOTE(review): presumably the HAL encoding of f32 (low byte 0x20 = 32)
      // - confirm against the HAL element type definitions.
      %c50331680 = vm.const.i32 50331680 : i32
      // The i32 returned by the await is handed to vm.cond_fail.
      %0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
      vm.cond_fail %0, "semaphore wait failed"
      %ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
      %ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
      vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
      vm.return %ref_2 : !vm.ref<!hal.buffer_view>
    }
    // Exported under its own name.
    vm.export @conv$async
    // Synchronous wrapper: creates a semaphore with initial value 0, calls
    // @conv$async waiting on (semaphore, 0) and signalling (semaphore, 1),
    // then awaits the semaphore reaching 1 before returning the result view.
    //   %arg0, %arg1: input buffer views, forwarded to @conv$async.
    //   returns:      the buffer view produced by @conv$async.
    vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
      %zero = vm.const.i32.zero : i32
      %c1 = vm.const.i32 1 : i32
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
      // The same semaphore serves as both the wait and the signal semaphore.
      %ref_1 = vm.call @conv$async(%ref_0, %zero, %arg0, %arg1, %ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.semaphore>, i32) -> !vm.ref<!hal.buffer_view>
      %0 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
      vm.cond_fail %0, "semaphore wait failed"
      vm.return %ref_1 : !vm.ref<!hal.buffer_view>
    }
    // Exported under the public name "conv".
    vm.export @conv$sync as("conv")
    vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
    vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.ref<!iree.byte_buffer>, %offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
    vm.import @hal.buffer.allocator(%buffer : !vm.ref<!hal.buffer>) -> !vm.ref<!hal.allocator> attributes {sym_visibility = "private"}
    vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
    vm.import @hal.buffer.fill(%target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
    vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> i32 attributes {sym_visibility = "private"}
    vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
    vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.trace(%key : !vm.ref<!iree.byte_buffer>, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i32 ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i32) attributes {sym_visibility = "private"}
    vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
    vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i32) attributes {sym_visibility = "private"}
    vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
    vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
    vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
  }
}


// *** IR Dump After mlir::iree_compiler::IREE::VM::GlobalInitializationPass ***
vm.module @module {
  // Module-level i32 holding the result of the device id match. It is written
  // by @__init from @_device_match_id_0_initializer and loaded by
  // @_executable_conv_dispatch_0_initializer and @conv to pick a branch.
  vm.global.i32 @_device_match_id_0 mutable : i32
  // Pattern handed to @hal.device.match.id. The seven bytes are the ASCII
  // characters "vulkan*".
  vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
  // Passes the shared HAL device and the pattern stored in
  // @_utf8_vulkan_7197BF52A22CAFD7 to @hal.device.match.id and returns the i32
  // result of that call unchanged.
  vm.func private @_device_match_id_0_initializer() -> i32 {
    %pattern = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
    %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %is_match = vm.call @hal.device.match.id(%device, %pattern) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
    vm.return %is_match : i32
  }
  // Module-level descriptor set layout reference. @__init stores the result of
  // @_descriptor_set_layout_0_initializer here; @_executable_layout_0_initializer
  // loads it.
  vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
  // Creates the descriptor set layout with a single
  // @hal.descriptor_set_layout.create call: usage_type 1 and three
  // (i32, i32, i32) binding tuples, (0, 7, 1), (1, 7, 1) and (2, 7, 6).
  // NOTE(review): the import signature does not name the tuple fields;
  // presumably (binding, type, access) -- confirm against the HAL module.
  vm.func private @_descriptor_set_layout_0_initializer() -> !vm.ref<!hal.descriptor_set_layout> {
    %usage_type = vm.const.i32 1 : i32
    // First tuple: (0, 7, 1).
    %t0_f0 = vm.const.i32.zero : i32
    %t0_f1 = vm.const.i32 7 : i32
    %t0_f2 = vm.const.i32 1 : i32
    // Second tuple: (1, 7, 1).
    %t1_f0 = vm.const.i32 1 : i32
    %t1_f1 = vm.const.i32 7 : i32
    %t1_f2 = vm.const.i32 1 : i32
    // Third tuple: (2, 7, 6).
    %t2_f0 = vm.const.i32 2 : i32
    %t2_f1 = vm.const.i32 7 : i32
    %t2_f2 = vm.const.i32 6 : i32
    %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %set_layout = vm.call.variadic @hal.descriptor_set_layout.create(%device, %usage_type, [(%t0_f0, %t0_f1, %t0_f2), (%t1_f0, %t1_f1, %t1_f2), (%t2_f0, %t2_f1, %t2_f2)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    vm.return %set_layout : !vm.ref<!hal.descriptor_set_layout>
  }
  // Module-level executable layout reference. @__init stores the result of
  // @_executable_layout_0_initializer here; @conv and
  // @_executable_conv_dispatch_0_initializer load it.
  vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
  // Creates the executable layout from the already-initialized
  // @_descriptor_set_layout_0 global, passing 0 for the push_constants
  // operand of @hal.executable_layout.create.
  vm.func private @_executable_layout_0_initializer() -> !vm.ref<!hal.executable_layout> {
    %push_constants = vm.const.i32.zero : i32
    %set_layout = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
    %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %layout = vm.call.variadic @hal.executable_layout.create(%device, %push_constants, [%set_layout]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
    vm.return %layout : !vm.ref<!hal.executable_layout>
  }
  // Module-level executable reference. @__init stores the result of
  // @_executable_conv_dispatch_0_initializer here (which may be a null ref);
  // @conv loads it for the dispatch.
  vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
  vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A00000
0150000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A00000
0F50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C000000740000007700000
07A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA00000
03D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D00040
009000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
  // Produces the value stored in @_executable_conv_dispatch_0. When the
  // @_device_match_id_0 flag is set, the executable is created from the
  // @_conv_dispatch_0_vulkan_spirv_binary_spirv rodata and the
  // @_executable_layout_0 global; otherwise a null reference is returned.
  vm.func private @_executable_conv_dispatch_0_initializer() -> !vm.ref<!hal.executable> {
    %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %device_matches = vm.global.load.i32 @_device_match_id_0 : i32
    vm.cond_br %device_matches, ^create, ^skip
  ^create:  // pred: entry block
    %layout = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
    // executable_format operand: 1397773893 == 0x53505645 (ASCII "SPVE").
    %format = vm.const.i32 1397773893 : i32
    %binary = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
    %created = vm.call.variadic @hal.executable.create(%device, %format, %binary, [%layout]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
    vm.br ^done(%created : !vm.ref<!hal.executable>)
  ^skip:  // pred: entry block
    %none = vm.const.ref.zero : !vm.ref<!hal.executable>
    vm.br ^done(%none : !vm.ref<!hal.executable>)
  ^done(%result: !vm.ref<!hal.executable>):  // 2 preds: ^create, ^skip
    vm.return %result : !vm.ref<!hal.executable>
  }
  // Allocates a result buffer (allocation_size 1605632), records a command
  // buffer that binds %arg0, %arg1 and the result buffer and dispatches entry
  // point 0 of @_executable_conv_dispatch_0 with a 2 x 28 x 28 workgroup count,
  // then submits it, waits, and returns the result buffer.
  // Fails with status 2 when the @_device_match_id_0 flag is not set.
  // NOTE(review): the last two i32 fields of each descriptor tuple are
  // presumably (offset, length) -- the import signature does not name them.
  vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
    // Operands for the allocation and for command buffer creation.
    %memory_types = vm.const.i32 50 : i32
    %buffer_usage = vm.const.i32 14 : i32
    %alloc_size = vm.const.i32 1605632 : i32
    %modes = vm.const.i32 1 : i32
    %categories = vm.const.i32 3 : i32
    // Operands for the descriptor tuples and the workgroup counts.
    %zero = vm.const.i32.zero : i32
    %one = vm.const.i32 1 : i32
    %two = vm.const.i32 2 : i32
    %k607500 = vm.const.i32 607500 : i32
    %k3456 = vm.const.i32 3456 : i32
    %k28 = vm.const.i32 28 : i32
    %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %allocator = vm.call @hal.device.allocator(%device) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    %result = vm.call @hal.allocator.allocate(%allocator, %memory_types, %buffer_usage, %alloc_size) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
    %cmd = vm.call @hal.command_buffer.create(%device, %modes, %categories) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
    vm.call @hal.command_buffer.begin(%cmd) : (!vm.ref<!hal.command_buffer>) -> ()
    %layout = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%cmd, %layout, %zero, [(%zero, %arg0, %zero, %k607500), (%one, %arg1, %zero, %k3456), (%two, %result, %zero, %alloc_size)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
    %device_matches = vm.global.load.i32 @_device_match_id_0 : i32
    vm.cond_br %device_matches, ^dispatch, ^unsupported
  ^dispatch:  // pred: entry block
    %entry_point = vm.const.i32.zero : i32
    %source_stages = vm.const.i32 20 : i32
    %target_stages = vm.const.i32 5 : i32
    %barrier_flags = vm.const.i32.zero : i32
    %executable = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
    vm.call @hal.command_buffer.dispatch(%cmd, %executable, %entry_point, %two, %k28, %k28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.execution_barrier(%cmd, %source_stages, %target_stages, %barrier_flags) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.end(%cmd) : (!vm.ref<!hal.command_buffer>) -> ()
    vm.call @hal.ex.submit_and_wait(%device, %cmd) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
    vm.return %result : !vm.ref<!hal.buffer>
  ^unsupported:  // pred: entry block
    %status = vm.const.i32 2 : i32
    vm.fail %status, "unreachable location reached"
  }
  // Semaphore-ordered wrapper around @conv. Awaits (%arg0, %arg1) and fails if
  // the await returns a non-zero status; otherwise unwraps the buffers of
  // %arg2 and %arg3, calls @conv, wraps the result in a buffer view with
  // dimensions [1, 112, 112, 32], signals (%arg4, %arg5) and returns the view.
  vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
    %wait_status = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
    vm.cond_fail %wait_status, "semaphore wait failed"
    %in0_buffer = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %in1_buffer = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %out_buffer = vm.call @conv(%in0_buffer, %in1_buffer) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
    // element_type operand: 50331680 == 0x03000020.
    %element_type = vm.const.i32 50331680 : i32
    %dim1 = vm.const.i32 1 : i32
    %dim112 = vm.const.i32 112 : i32
    %dim32 = vm.const.i32 32 : i32
    %out_view = vm.call.variadic @hal.buffer_view.create(%out_buffer, %element_type, [%dim1, %dim112, %dim112, %dim32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
    vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
    vm.return %out_view : !vm.ref<!hal.buffer_view>
  }
  vm.export @conv$async
  // Blocking wrapper around @conv$async. Creates a semaphore with initial
  // value 0, calls @conv$async using it for both the wait (value 0) and the
  // signal (value 1), then awaits value 1 and fails if that await returns a
  // non-zero status.
  vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %start_value = vm.const.i32.zero : i32
    %done_value = vm.const.i32 1 : i32
    %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %semaphore = vm.call @hal.semaphore.create(%device, %start_value) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
    %out_view = vm.call @conv$async(%semaphore, %start_value, %arg0, %arg1, %semaphore, %done_value) : (!vm.ref<!hal.semaphore>, i32, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.semaphore>, i32) -> !vm.ref<!hal.buffer_view>
    %wait_status = vm.call @hal.semaphore.await(%semaphore, %done_value) : (!vm.ref<!hal.semaphore>, i32) -> i32
    vm.cond_fail %wait_status, "semaphore wait failed"
    vm.return %out_view : !vm.ref<!hal.buffer_view>
  }
  vm.export @conv$sync as("conv")
  vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
  vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
  vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.ref<!iree.byte_buffer>, %offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
  vm.import @hal.buffer.allocator(%buffer : !vm.ref<!hal.buffer>) -> !vm.ref<!hal.allocator> attributes {sym_visibility = "private"}
  vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
  vm.import @hal.buffer.fill(%target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
  vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> i32 attributes {sym_visibility = "private"}
  vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
  vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i32 attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.buffer_view.trace(%key : !vm.ref<!iree.byte_buffer>, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i32 ...) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i32) attributes {sym_visibility = "private"}
  vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
  vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i32) attributes {sym_visibility = "private"}
  vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
  vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
  vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
  // Module initializer: runs each global's initializer function and stores its
  // result into the matching global. The order matters because later
  // initializers load globals written by earlier ones (the executable layout
  // reads @_descriptor_set_layout_0; the executable reads @_device_match_id_0
  // and @_executable_layout_0).
  vm.func @__init() {
    %device_matches = vm.call @_device_match_id_0_initializer() : () -> i32
    vm.global.store.i32 %device_matches, @_device_match_id_0 : i32
    %set_layout = vm.call @_descriptor_set_layout_0_initializer() : () -> !vm.ref<!hal.descriptor_set_layout>
    vm.global.store.ref %set_layout, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
    %exe_layout = vm.call @_executable_layout_0_initializer() : () -> !vm.ref<!hal.executable_layout>
    vm.global.store.ref %exe_layout, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
    %executable = vm.call @_executable_conv_dispatch_0_initializer() : () -> !vm.ref<!hal.executable>
    vm.global.store.ref %executable, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
    vm.return
  }
  vm.export @__init
}

// *** IR Dump After Canonicalizer ***
// Blocking wrapper around @conv$async. Creates a semaphore with initial value
// 0, calls @conv$async using it for both the wait (value 0) and the signal
// (value 1), then awaits value 1. A non-zero await status is forwarded to
// vm.fail; otherwise the buffer view from @conv$async is returned.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %start_value = vm.const.i32.zero : i32
  %done_value = vm.const.i32 1 : i32
  %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  %semaphore = vm.call @hal.semaphore.create(%device, %start_value) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
  %out_view = vm.call @conv$async(%semaphore, %start_value, %arg0, %arg1, %semaphore, %done_value) : (!vm.ref<!hal.semaphore>, i32, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.buffer_view>, !vm.ref<!hal.semaphore>, i32) -> !vm.ref<!hal.buffer_view>
  %wait_status = vm.call @hal.semaphore.await(%semaphore, %done_value) : (!vm.ref<!hal.semaphore>, i32) -> i32
  vm.cond_br %wait_status, ^wait_failed(%wait_status : i32), ^ok
^ok:  // pred: entry block
  vm.return %out_view : !vm.ref<!hal.buffer_view>
^wait_failed(%code: i32):  // pred: entry block
  vm.fail %code, "semaphore wait failed"
}

// *** IR Dump After Canonicalizer ***
// Semaphore-ordered wrapper around @conv. Awaits (%arg0, %arg1); a non-zero
// status is forwarded to vm.fail. Otherwise it unwraps the buffers of %arg2
// and %arg3, calls @conv, wraps the result in a buffer view with dimensions
// [1, 112, 112, 32], signals (%arg4, %arg5) and returns the view.
vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
  // element_type operand: 50331680 == 0x03000020.
  %element_type = vm.const.i32 50331680 : i32
  %dim1 = vm.const.i32 1 : i32
  %dim112 = vm.const.i32 112 : i32
  %dim32 = vm.const.i32 32 : i32
  %wait_status = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
  vm.cond_br %wait_status, ^wait_failed(%wait_status : i32), ^run
^run:  // pred: entry block
  %in0_buffer = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  %in1_buffer = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  %out_buffer = vm.call @conv(%in0_buffer, %in1_buffer) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
  %out_view = vm.call.variadic @hal.buffer_view.create(%out_buffer, %element_type, [%dim1, %dim112, %dim112, %dim32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
  vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
  vm.return %out_view : !vm.ref<!hal.buffer_view>
^wait_failed(%code: i32):  // pred: entry block
  vm.fail %code, "semaphore wait failed"
}

// *** IR Dump After Canonicalizer ***
// Allocates a result buffer (allocation_size 1605632), records a command
// buffer that binds %arg0, %arg1 and the result buffer and dispatches entry
// point 0 of @_executable_conv_dispatch_0 with a 2 x 28 x 28 workgroup count,
// then submits it, waits, and returns the result buffer.
// Fails with status 2 when the @_device_match_id_0 flag is not set.
// NOTE(review): the last two i32 fields of each descriptor tuple are
// presumably (offset, length) -- the import signature does not name them.
vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
  // Small shared constants; each is reused by several operands below.
  %zero = vm.const.i32.zero : i32
  %one = vm.const.i32 1 : i32
  %two = vm.const.i32 2 : i32
  %three = vm.const.i32 3 : i32
  // Operands for the allocation.
  %memory_types = vm.const.i32 50 : i32
  %buffer_usage = vm.const.i32 14 : i32
  %alloc_size = vm.const.i32 1605632 : i32
  // Last tuple field for the %arg0 and %arg1 bindings.
  %k607500 = vm.const.i32 607500 : i32
  %k3456 = vm.const.i32 3456 : i32
  // Workgroup count (y and z) and barrier stage masks.
  %k28 = vm.const.i32 28 : i32
  %source_stages = vm.const.i32 20 : i32
  %target_stages = vm.const.i32 5 : i32
  %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  %allocator = vm.call @hal.device.allocator(%device) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
  %result = vm.call @hal.allocator.allocate(%allocator, %memory_types, %buffer_usage, %alloc_size) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
  %cmd = vm.call @hal.command_buffer.create(%device, %one, %three) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
  vm.call @hal.command_buffer.begin(%cmd) : (!vm.ref<!hal.command_buffer>) -> ()
  %layout = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
  vm.call.variadic @hal.command_buffer.push_descriptor_set(%cmd, %layout, %zero, [(%zero, %arg0, %zero, %k607500), (%one, %arg1, %zero, %k3456), (%two, %result, %zero, %alloc_size)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
  %device_matches = vm.global.load.i32 @_device_match_id_0 : i32
  vm.cond_br %device_matches, ^dispatch, ^unsupported
^dispatch:  // pred: entry block
  %executable = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
  vm.call @hal.command_buffer.dispatch(%cmd, %executable, %zero, %two, %k28, %k28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.execution_barrier(%cmd, %source_stages, %target_stages, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
  vm.call @hal.command_buffer.end(%cmd) : (!vm.ref<!hal.command_buffer>) -> ()
  vm.call @hal.ex.submit_and_wait(%device, %cmd) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
  vm.return %result : !vm.ref<!hal.buffer>
^unsupported:  // pred: entry block
  // The constant 2 doubles as the failure status here.
  vm.fail %two, "unreachable location reached"
}

// *** IR Dump After Canonicalizer ***
// Produces the !hal.executable for the conv dispatch.
// Returns the result of hal.executable.create when the @_device_match_id_0
// global is non-zero, and a null ref otherwise. The inlined copy of this logic
// in @__init stores the result into @_executable_conv_dispatch_0.
vm.func private @_executable_conv_dispatch_0_initializer() -> !vm.ref<!hal.executable> {
  // 1397773893 == 0x53505645, whose bytes read "SPVE" in ASCII. It is passed as
  // the second operand of hal.executable.create below.
  %c1397773893 = vm.const.i32 1397773893 : i32
  // Null executable used as the result when the device did not match.
  %null = vm.const.ref.zero : !vm.ref<!hal.executable>
  %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  // Branch on the flag that @__init computed via hal.device.match.id.
  %_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
  vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  // Matched: create the executable from the embedded rodata blob and the
  // previously initialized executable layout.
  %_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
  %_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
  %ref_0 = vm.call.variadic @hal.executable.create(%ref, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
  vm.br ^bb3(%ref_0 : !vm.ref<!hal.executable>)
^bb2:  // pred: ^bb0
  // Not matched: forward the null ref.
  vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%0: !vm.ref<!hal.executable>):  // 2 preds: ^bb1, ^bb2
  // Merge point: %0 is either the created executable or null.
  vm.return %0 : !vm.ref<!hal.executable>
}

// *** IR Dump After Canonicalizer ***
// Creates and returns a !hal.executable_layout built from the single
// descriptor set layout held in the @_descriptor_set_layout_0 global.
// That global must already be populated when this runs, because it is loaded
// here without any null check.
vm.func private @_executable_layout_0_initializer() -> !vm.ref<!hal.executable_layout> {
  // Passed as the second operand of hal.executable_layout.create
  // (presumably the push-constant count -- confirm against the import declaration).
  %zero = vm.const.i32.zero : i32
  %_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
  %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  // The variadic list carries exactly one set layout.
  %ref_0 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
  vm.return %ref_0 : !vm.ref<!hal.executable_layout>
}

// *** IR Dump After Canonicalizer ***
// Creates and returns a !hal.descriptor_set_layout on the shared device with
// three binding entries, whose first tuple elements are 0, 1 and 2.
vm.func private @_descriptor_set_layout_0_initializer() -> !vm.ref<!hal.descriptor_set_layout> {
  // Integer operands for the create call. The IR does not name the tuple fields;
  // NOTE(review): they look like (binding index, descriptor type, access flags)
  // -- confirm against the HAL enum definitions.
  %c1 = vm.const.i32 1 : i32
  %zero = vm.const.i32.zero : i32
  %c7 = vm.const.i32 7 : i32
  %c2 = vm.const.i32 2 : i32
  %c6 = vm.const.i32 6 : i32
  %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  // Entries 0 and 1 share the trailing value 1; entry 2 uses 6 instead.
  // All three use 7 as the middle value.
  %ref_0 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
  vm.return %ref_0 : !vm.ref<!hal.descriptor_set_layout>
}

// *** IR Dump After Canonicalizer ***
// Module initializer. Populates four globals in order:
//   1. @_device_match_id_0         <- result of hal.device.match.id
//   2. @_descriptor_set_layout_0   <- hal.descriptor_set_layout.create
//   3. @_executable_layout_0       <- hal.executable_layout.create
//   4. @_executable_conv_dispatch_0 <- hal.executable.create, or null if the
//                                      device did not match
// Each later step reads a global written by an earlier one, so the order matters.
vm.func @__init() {
  %c1 = vm.const.i32 1 : i32
  %zero = vm.const.i32.zero : i32
  %c7 = vm.const.i32 7 : i32
  %c2 = vm.const.i32 2 : i32
  %c6 = vm.const.i32 6 : i32
  // 1397773893 == 0x53505645 (ASCII "SPVE"); second operand of hal.executable.create.
  %c1397773893 = vm.const.i32 1397773893 : i32
  %null = vm.const.ref.zero : !vm.ref<!hal.executable>
  // Step 1: match the shared device against the byte-buffer pattern and cache
  // the i32 result.
  %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  %_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
  %0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
  vm.global.store.i32 %0, @_device_match_id_0 : i32
  // Step 2: descriptor set layout with three binding tuples (first elements 0, 1, 2).
  // %ref_0 (and %ref_2, %ref_4 below) come from separate hal.ex.shared_device
  // calls rather than reusing %ref.
  %ref_0 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  %ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref_0, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
  vm.global.store.ref %ref_1, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
  // Step 3: the global just stored is loaded straight back and used to build
  // the executable layout.
  %_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
  %ref_2 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  %ref_3 = vm.call.variadic @hal.executable_layout.create(%ref_2, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
  vm.global.store.ref %ref_3, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
  // Step 4: create the executable only when the match flag from step 1 is non-zero.
  %ref_4 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  %_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
  vm.cond_br %_device_match_id_0, ^bb1, ^bb2
^bb1:  // pred: ^bb0
  %_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
  %_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
  %ref_5 = vm.call.variadic @hal.executable.create(%ref_4, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
  vm.br ^bb3(%ref_5 : !vm.ref<!hal.executable>)
^bb2:  // pred: ^bb0
  // No match: a null executable is stored instead of failing here.
  vm.br ^bb3(%null : !vm.ref<!hal.executable>)
^bb3(%1: !vm.ref<!hal.executable>):  // 2 preds: ^bb1, ^bb2
  vm.global.store.ref %1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
  vm.return
}

// *** IR Dump After Canonicalizer ***
// Buffer-view entry point around @conv.
// Unwraps the two input buffer views, calls @conv on the underlying buffers,
// and wraps the resulting buffer in a new buffer view of shape [1, 112, 112, 32].
// Fails with the await status if either semaphore wait returns non-zero.
vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
  %zero = vm.const.i32.zero : i32
  %c1 = vm.const.i32 1 : i32
  %c112 = vm.const.i32 112 : i32
  %c32 = vm.const.i32 32 : i32
  // 50331680 == 0x03000020; passed as the element-type operand of
  // hal.buffer_view.create. NOTE(review): presumably the encoding for f32 -- confirm.
  %c50331680 = vm.const.i32 50331680 : i32
  %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
  // A local semaphore is created with initial value 0 and immediately awaited
  // at 0; both the wait and the later signal are on values this function sets.
  %ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
  %0 = vm.call @hal.semaphore.await(%ref_0, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
  // A non-zero await result is treated as a failure status and forwarded to ^bb2.
  vm.cond_br %0, ^bb2(%0 : i32), ^bb1
^bb1:  // pred: ^bb0
  %ref_1 = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
  %ref_3 = vm.call @conv(%ref_1, %ref_2) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
  // Result shape is hard-coded to 1x112x112x32.
  %ref_4 = vm.call.variadic @hal.buffer_view.create(%ref_3, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
  // Signal the local semaphore to 1, then wait for that same value.
  vm.call @hal.semaphore.signal(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> ()
  %1 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
  vm.cond_br %1, ^bb2(%1 : i32), ^bb3
^bb2(%2: i32):  // 2 preds: ^bb0, ^bb1
  // Shared failure exit for both waits; %2 is the non-zero await result.
  vm.fail %2, "semaphore wait failed"
^bb3:  // pred: ^bb1
  vm.return %ref_4 : !vm.ref<!hal.buffer_view>
}

// *** IR Dump After Inliner ***
module  {
  vm.module @module {
    vm.global.i32 @_device_match_id_0 mutable : i32
    vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
    vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
    vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
    vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
    vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A000
000150000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000
000F50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C0000007400000077000
0007A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA000
0003D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D000
40009000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
    // Allocates an output buffer, records a one-dispatch command buffer that
    // binds %arg0, %arg1 and the output buffer, submits it with
    // hal.ex.submit_and_wait, and returns the output buffer.
    // Fails with status 2 when the @_device_match_id_0 global is zero.
    // Marked noinline, so the wrappers call it rather than absorbing its body.
    vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
      // Byte lengths used in the descriptor bindings below. As f32 (4-byte)
      // element counts they factor as:
      //   3456    = 3*3*3*32*4
      //   607500  = 1*225*225*3*4
      //   1605632 = 1*112*112*32*4
      %c3456 = vm.const.i32 3456 : i32
      %c607500 = vm.const.i32 607500 : i32
      %c1605632 = vm.const.i32 1605632 : i32
      %c1 = vm.const.i32 1 : i32
      %zero = vm.const.i32.zero : i32
      %c2 = vm.const.i32 2 : i32
      %c28 = vm.const.i32 28 : i32
      // 50 and 14 are the memory_types and buffer_usage operands of
      // hal.allocator.allocate; they look like bitmasks -- TODO confirm their meaning.
      %c50 = vm.const.i32 50 : i32
      %c14 = vm.const.i32 14 : i32
      %c3 = vm.const.i32 3 : i32
      // 20 and 5 are the source and target stage masks of the execution barrier.
      %c20 = vm.const.i32 20 : i32
      %c5 = vm.const.i32 5 : i32
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      // Output buffer (%ref_1), 1605632 bytes; this is the value returned.
      %ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
      // Command buffer created with modes = 1 and command_categories = 3.
      %ref_2 = vm.call @hal.command_buffer.create(%ref, %c1, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
      vm.call @hal.command_buffer.begin(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
      %_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      // Push set 0 with three entries: %arg0, %arg1 and the output buffer, each
      // at offset 0. Tuples are presumably (binding, buffer, offset, length) --
      // the import declaration does not name the fields.
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_2, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
      // The device-match check happens after allocation and after
      // command_buffer.begin; on the failing path the command buffer is never ended.
      %_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
      vm.cond_br %_device_match_id_0, ^bb1, ^bb2
    ^bb1:  // pred: ^bb0
      %_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
      // Dispatch entry point 0 with workgroup counts x = 2, y = 28, z = 28.
      vm.call @hal.command_buffer.dispatch(%ref_2, %_executable_conv_dispatch_0, %zero, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.execution_barrier(%ref_2, %c20, %c5, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.end(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
      vm.call @hal.ex.submit_and_wait(%ref, %ref_2) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
      vm.return %ref_1 : !vm.ref<!hal.buffer>
    ^bb2:  // pred: ^bb0
      // Reached only when the device did not match; %c2 is reused as the status.
      vm.fail %c2, "unreachable location reached"
    }
    // Semaphore-driven entry point around @conv.
    //   %arg0 / %arg1 : semaphore and value awaited before any work is done
    //   %arg2 / %arg3 : input buffer views forwarded (as buffers) to @conv
    //   %arg4 / %arg5 : semaphore and value signaled after the result view exists
    // Returns a buffer view of shape [1, 112, 112, 32] over @conv's result.
    // Fails with the await status if the initial wait returns non-zero.
    vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
      %c1 = vm.const.i32 1 : i32
      %c112 = vm.const.i32 112 : i32
      %c32 = vm.const.i32 32 : i32
      // 50331680 == 0x03000020; element-type operand of hal.buffer_view.create.
      // NOTE(review): presumably the encoding for f32 -- confirm.
      %c50331680 = vm.const.i32 50331680 : i32
      %0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
      // A non-zero await result is forwarded to ^bb2 as the failure status.
      vm.cond_br %0, ^bb2(%0 : i32), ^bb1
    ^bb1:  // pred: ^bb0
      %ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
      %ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
      // The signal is issued only after @conv has returned and the view is built.
      vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
      vm.return %ref_2 : !vm.ref<!hal.buffer_view>
    ^bb2(%1: i32):  // pred: ^bb0
      vm.fail %1, "semaphore wait failed"
    }
    vm.export @conv$async
    // Buffer-view entry point around @conv.
    // Unwraps the two input buffer views, calls @conv on the underlying buffers,
    // and wraps the result in a new buffer view of shape [1, 112, 112, 32].
    // Fails with the await status if either semaphore wait returns non-zero.
    vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
      %zero = vm.const.i32.zero : i32
      %c1 = vm.const.i32 1 : i32
      %c112 = vm.const.i32 112 : i32
      %c32 = vm.const.i32 32 : i32
      // 50331680 == 0x03000020; element-type operand of hal.buffer_view.create.
      // NOTE(review): presumably the encoding for f32 -- confirm.
      %c50331680 = vm.const.i32 50331680 : i32
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      // A local semaphore is created with initial value 0 and immediately
      // awaited at 0; both waits in this function are on values it sets itself.
      %ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
      %0 = vm.call @hal.semaphore.await(%ref_0, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
      // A non-zero await result is treated as a failure status.
      vm.cond_br %0, ^bb2(%0 : i32), ^bb1
    ^bb1:  // pred: ^bb0
      %ref_1 = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_3 = vm.call @conv(%ref_1, %ref_2) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
      // Result shape is hard-coded to 1x112x112x32.
      %ref_4 = vm.call.variadic @hal.buffer_view.create(%ref_3, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
      // Signal the local semaphore to 1, then wait for that same value.
      vm.call @hal.semaphore.signal(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> ()
      %1 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
      vm.cond_br %1, ^bb2(%1 : i32), ^bb3
    ^bb2(%2: i32):  // 2 preds: ^bb0, ^bb1
      // Shared failure exit for both waits; %2 is the non-zero await result.
      vm.fail %2, "semaphore wait failed"
    ^bb3:  // pred: ^bb1
      vm.return %ref_4 : !vm.ref<!hal.buffer_view>
    }
    vm.export @conv$sync as("conv")
    vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
    vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.ref<!iree.byte_buffer>, %offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
    vm.import @hal.buffer.allocator(%buffer : !vm.ref<!hal.buffer>) -> !vm.ref<!hal.allocator> attributes {sym_visibility = "private"}
    vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
    vm.import @hal.buffer.fill(%target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
    vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %length : i32) -> i32 attributes {sym_visibility = "private"}
    vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
    vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.trace(%key : !vm.ref<!iree.byte_buffer>, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32, %pattern : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i32, %length : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i32 ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i32) attributes {sym_visibility = "private"}
    vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"}
    vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i32) attributes {sym_visibility = "private"}
    vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
    vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"}
    vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
    // Module initializer. Populates four globals in order:
    //   1. @_device_match_id_0          <- result of hal.device.match.id
    //   2. @_descriptor_set_layout_0    <- hal.descriptor_set_layout.create
    //   3. @_executable_layout_0        <- hal.executable_layout.create
    //   4. @_executable_conv_dispatch_0 <- hal.executable.create, or null if
    //                                      the device did not match
    // Each later step reads a global written by an earlier one, so the order matters.
    vm.func @__init() {
      %c1 = vm.const.i32 1 : i32
      %zero = vm.const.i32.zero : i32
      %c7 = vm.const.i32 7 : i32
      %c2 = vm.const.i32 2 : i32
      %c6 = vm.const.i32 6 : i32
      // 1397773893 == 0x53505645 (ASCII "SPVE"); passed as the
      // executable_format operand of hal.executable.create.
      %c1397773893 = vm.const.i32 1397773893 : i32
      %null = vm.const.ref.zero : !vm.ref<!hal.executable>
      // Step 1: match the shared device against the pattern rodata, whose bytes
      // [118, 117, 108, 107, 97, 110, 42] spell "vulkan*", and cache the result.
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
      %0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
      vm.global.store.i32 %0, @_device_match_id_0 : i32
      // Step 2: descriptor set layout with usage_type = 1 and three binding
      // tuples (first elements 0, 1, 2). %ref_0 (and %ref_2, %ref_4 below) come
      // from separate hal.ex.shared_device calls rather than reusing %ref.
      %ref_0 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref_0, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
      vm.global.store.ref %ref_1, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
      // Step 3: the global just stored is loaded straight back and used to
      // build the executable layout with push_constants = 0.
      %_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
      %ref_2 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %ref_3 = vm.call.variadic @hal.executable_layout.create(%ref_2, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
      vm.global.store.ref %ref_3, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      // Step 4: create the executable only when the match flag from step 1 is non-zero.
      %ref_4 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
      vm.cond_br %_device_match_id_0, ^bb1, ^bb2
    ^bb1:  // pred: ^bb0
      %_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      %_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
      %ref_5 = vm.call.variadic @hal.executable.create(%ref_4, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
      vm.br ^bb3(%ref_5 : !vm.ref<!hal.executable>)
    ^bb2:  // pred: ^bb0
      // No match: a null executable is stored instead of failing here; @conv
      // performs its own check of @_device_match_id_0 before dispatching.
      vm.br ^bb3(%null : !vm.ref<!hal.executable>)
    ^bb3(%1: !vm.ref<!hal.executable>):  // 2 preds: ^bb1, ^bb2
      vm.global.store.ref %1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
      vm.return
    }
    vm.export @__init
  }
}


// *** IR Dump After SymbolDCE ***
module  {
  vm.module @module {
    // Module-level state. All three ref globals and the i32 global are written
    // by @__init; @conv reads @_device_match_id_0, @_executable_layout_0 and
    // @_executable_conv_dispatch_0.
    //
    // Result of hal.device.match.id against the pattern below (stored in @__init).
    vm.global.i32 @_device_match_id_0 mutable : i32
    // Device-ID match pattern; the 7 bytes are the ASCII string "vulkan*".
    vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
    // Descriptor set layout with three bindings (0, 1, 2), created in @__init.
    vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
    // Executable layout built from @_descriptor_set_layout_0 with 0 push constants.
    vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
    // Executable for the conv dispatch; null when the device did not match.
    vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
    vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A000
000150000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000
000F50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C0000007400000077000
0007A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA000
0003D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D000
40009000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
    // @conv: records and synchronously executes a single dispatch of
    // @_executable_conv_dispatch_0 over the two input buffers, returning a
    // freshly allocated output buffer.
    //
    //   %arg0, %arg1 : input buffers, bound at bindings 0 and 1.
    //   returns      : the 1605632-byte buffer allocated here, bound at binding 2.
    //
    // The bound byte ranges match the f32 tensor shapes of the original
    // function signature: 607500 = 1*225*225*3*4, 3456 = 3*3*3*32*4,
    // 1605632 = 1*112*112*32*4.
    // Fails with status %c2 if @_device_match_id_0 is zero.
    vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
      %c3456 = vm.const.i32 3456 : i32
      %c607500 = vm.const.i32 607500 : i32
      %c1605632 = vm.const.i32 1605632 : i32
      %c1 = vm.const.i32 1 : i32
      %zero = vm.const.i32.zero : i32
      %c2 = vm.const.i32 2 : i32
      %c28 = vm.const.i32 28 : i32
      %c50 = vm.const.i32 50 : i32
      %c14 = vm.const.i32 14 : i32
      %c3 = vm.const.i32 3 : i32
      %c20 = vm.const.i32 20 : i32
      %c5 = vm.const.i32 5 : i32
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %ref_0 = vm.call @hal.device.allocator(%ref) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      // Output buffer: memory_types = 50, buffer_usage = 14 (bitfield values;
      // their individual flag meanings are not visible in this dump).
      %ref_1 = vm.call @hal.allocator.allocate(%ref_0, %c50, %c14, %c1605632) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
      // Command buffer with modes = 1 and command_categories = 3.
      %ref_2 = vm.call @hal.command_buffer.create(%ref, %c1, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
      vm.call @hal.command_buffer.begin(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
      %_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      // Push descriptor set 0. Each tuple is presumably
      // (binding, buffer, byte offset, byte length) -- TODO confirm against the
      // HAL import definition; the import only gives the tuple's types.
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_2, %_executable_layout_0, %zero, [(%zero, %arg0, %zero, %c607500), (%c1, %arg1, %zero, %c3456), (%c2, %ref_1, %zero, %c1605632)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
      // Only dispatch when the device matched the pattern checked in @__init.
      %_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
      vm.cond_br %_device_match_id_0, ^bb1, ^bb2
    ^bb1:  // pred: ^bb0
      %_executable_conv_dispatch_0 = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
      // Entry point 0 with workgroup counts x = 2, y = 28, z = 28.
      vm.call @hal.command_buffer.dispatch(%ref_2, %_executable_conv_dispatch_0, %zero, %c2, %c28, %c28) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
      // Barrier with source_stage_mask = 20, target_stage_mask = 5, flags = 0.
      vm.call @hal.command_buffer.execution_barrier(%ref_2, %c20, %c5, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.end(%ref_2) : (!vm.ref<!hal.command_buffer>) -> ()
      // Submit and wait, so the output buffer is returned only after this call.
      vm.call @hal.ex.submit_and_wait(%ref, %ref_2) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
      vm.return %ref_1 : !vm.ref<!hal.buffer>
    ^bb2:  // pred: ^bb0
      // No matching device: there is no executable to dispatch.
      vm.fail %c2, "unreachable location reached"
    }
    // @conv$async: semaphore-bracketed wrapper around @conv operating on
    // buffer views.
    //
    //   %arg0, %arg1 : semaphore and value to await before running.
    //   %arg2, %arg3 : input buffer views; their backing buffers are passed to @conv.
    //   %arg4, %arg5 : semaphore and value signaled after @conv returns.
    //   returns      : buffer view over @conv's result with shape [1, 112, 112, 32].
    //
    // Fails with the await result if hal.semaphore.await returns non-zero.
    vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
      %c1 = vm.const.i32 1 : i32
      %c112 = vm.const.i32 112 : i32
      %c32 = vm.const.i32 32 : i32
      // Element type code passed to hal.buffer_view.create (0x03000020).
      // NOTE(review): presumably encodes f32 -- confirm against the HAL element
      // type encoding.
      %c50331680 = vm.const.i32 50331680 : i32
      // A non-zero await result is treated as a failure status.
      %0 = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
      vm.cond_br %0, ^bb2(%0 : i32), ^bb1
    ^bb1:  // pred: ^bb0
      %ref = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_0 = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_1 = vm.call @conv(%ref, %ref_0) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
      %ref_2 = vm.call.variadic @hal.buffer_view.create(%ref_1, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
      // @conv ends with submit_and_wait, so the signal follows completed work.
      vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
      vm.return %ref_2 : !vm.ref<!hal.buffer_view>
    ^bb2(%1: i32):  // pred: ^bb0
      vm.fail %1, "semaphore wait failed"
    }
    // Exported under its own symbol name.
    vm.export @conv$async
    // @conv$sync: synchronous entry point, exported as "conv". Creates a local
    // semaphore, runs @conv on the buffers behind the two input buffer views,
    // and returns a buffer view over the result with shape [1, 112, 112, 32].
    //
    //   %arg0, %arg1 : input buffer views.
    //   returns      : buffer view wrapping @conv's output buffer.
    //
    // Fails with the await result if either hal.semaphore.await returns non-zero.
    vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
      %zero = vm.const.i32.zero : i32
      %c1 = vm.const.i32 1 : i32
      %c112 = vm.const.i32 112 : i32
      %c32 = vm.const.i32 32 : i32
      // Element type code passed to hal.buffer_view.create (0x03000020).
      // NOTE(review): presumably encodes f32 -- confirm against the HAL element
      // type encoding.
      %c50331680 = vm.const.i32 50331680 : i32
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      // Semaphore starts at 0 and is awaited at 0 before any work is issued.
      %ref_0 = vm.call @hal.semaphore.create(%ref, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
      %0 = vm.call @hal.semaphore.await(%ref_0, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
      vm.cond_br %0, ^bb2(%0 : i32), ^bb1
    ^bb1:  // pred: ^bb0
      %ref_1 = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_2 = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %ref_3 = vm.call @conv(%ref_1, %ref_2) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
      %ref_4 = vm.call.variadic @hal.buffer_view.create(%ref_3, %c50331680, [%c1, %c112, %c112, %c32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
      // Signal the local semaphore to 1, then await that same value before returning.
      vm.call @hal.semaphore.signal(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> ()
      %1 = vm.call @hal.semaphore.await(%ref_0, %c1) : (!vm.ref<!hal.semaphore>, i32) -> i32
      vm.cond_br %1, ^bb2(%1 : i32), ^bb3
    ^bb2(%2: i32):  // 2 preds: ^bb0, ^bb1
      // Shared failure block for both awaits; %2 carries the non-zero result.
      vm.fail %2, "semaphore wait failed"
    ^bb3:  // pred: ^bb1
      vm.return %ref_4 : !vm.ref<!hal.buffer_view>
    }
    // Public name is "conv" rather than the internal symbol @conv$sync.
    vm.export @conv$sync as("conv")
    // External HAL functions called by this module. Every import is declared
    // with private symbol visibility. Those carrying `nosideeffects` are
    // declared side-effect free; the rest (submit, allocate, command buffer
    // recording, semaphore signal/await) are not.
    vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
    vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
    vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
    vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
    // @__init: module initializer that populates every mutable global.
    //
    //   1. Matches the shared device against the "vulkan*" pattern and stores
    //      the result in @_device_match_id_0.
    //   2. Creates the 3-binding descriptor set layout and the executable layout.
    //   3. If the device matched, creates the executable from the embedded
    //      binary; otherwise stores a null executable ref.
    vm.func @__init() {
      %c1 = vm.const.i32 1 : i32
      %zero = vm.const.i32.zero : i32
      %c7 = vm.const.i32 7 : i32
      %c2 = vm.const.i32 2 : i32
      %c6 = vm.const.i32 6 : i32
      // Executable format tag: 0x53505645, i.e. the ASCII bytes "SPVE".
      %c1397773893 = vm.const.i32 1397773893 : i32
      %null = vm.const.ref.zero : !vm.ref<!hal.executable>
      %ref = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %_utf8_vulkan_7197BF52A22CAFD7 = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
      %0 = vm.call @hal.device.match.id(%ref, %_utf8_vulkan_7197BF52A22CAFD7) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
      vm.global.store.i32 %0, @_device_match_id_0 : i32
      %ref_0 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      // usage_type = 1; three bindings numbered 0, 1, 2. The second and third
      // tuple fields are presumably (descriptor type, access flags) -- TODO
      // confirm; the import only types them as i32. Binding 2 differs from the
      // others only in its last field (6 vs 1).
      %ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref_0, %c1, [(%zero, %c7, %c1), (%c1, %c7, %c1), (%c2, %c7, %c6)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
      vm.global.store.ref %ref_1, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
      %_descriptor_set_layout_0 = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
      %ref_2 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      // Executable layout: 0 push constants, one descriptor set layout.
      %ref_3 = vm.call.variadic @hal.executable_layout.create(%ref_2, %zero, [%_descriptor_set_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
      vm.global.store.ref %ref_3, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      %ref_4 = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      // Create the executable only when the device matched the pattern.
      %_device_match_id_0 = vm.global.load.i32 @_device_match_id_0 : i32
      vm.cond_br %_device_match_id_0, ^bb1, ^bb2
    ^bb1:  // pred: ^bb0
      %_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      %_conv_dispatch_0_vulkan_spirv_binary_spirv = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
      %ref_5 = vm.call.variadic @hal.executable.create(%ref_4, %c1397773893, %_conv_dispatch_0_vulkan_spirv_binary_spirv, [%_executable_layout_0]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
      vm.br ^bb3(%ref_5 : !vm.ref<!hal.executable>)
    ^bb2:  // pred: ^bb0
      // No match: fall through with a null executable.
      vm.br ^bb3(%null : !vm.ref<!hal.executable>)
    ^bb3(%1: !vm.ref<!hal.executable>):  // 2 preds: ^bb1, ^bb2
      vm.global.store.ref %1, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
      vm.return
    }
    vm.export @__init
  }
}


// *** IR Dump After mlir::iree_compiler::IREE::VM::SinkDefiningOpsPass ***
vm.module @module {
  // Cached i32 result of @hal.device.match.id; written by @__init and branched
  // on by @conv and @__init.
  vm.global.i32 @_device_match_id_0 mutable : i32
  // Device-match pattern bytes; the seven values decode as ASCII "vulkan*".
  vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
  // Descriptor set layout created once in @__init.
  vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
  // Executable layout created once in @__init from the descriptor set layout;
  // loaded by @conv when pushing descriptors.
  vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
  // Executable handle stored by @__init (a null ref when the device pattern did
  // not match); loaded by @conv for the dispatch.
  vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
  vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A00000
0150000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A00000
0F50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C000000740000007700000
07A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA00000
03D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D00040
009000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
  // Runs the convolution as one blocking dispatch: allocates the result buffer,
  // records a command buffer (push descriptors, dispatch, barrier), then calls
  // @hal.ex.submit_and_wait and returns the result buffer.
  //   %arg0, %arg1: the two operand buffers, pushed as the first two bindings.
  //   returns:      the buffer allocated here and pushed as the third binding.
  // Fails with status 2 when @_device_match_id_0 is zero.
  vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
    %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %allocator = vm.call @hal.device.allocator(%device) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
    // 1605632 = 1*112*112*32*4, the byte size of a 1x112x112x32xf32 tensor.
    %result_len = vm.const.i32 1605632 : i32
    // memory_types = 50 and buffer_usage = 14; the bit meanings are not visible
    // in this dump.
    %mem_types = vm.const.i32 50 : i32
    %usage = vm.const.i32 14 : i32
    %result = vm.call @hal.allocator.allocate(%allocator, %mem_types, %usage, %result_len) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
    %one = vm.const.i32 1 : i32
    %three = vm.const.i32 3 : i32
    // modes = 1, command_categories = 3 (names taken from the vm.import).
    %cmd = vm.call @hal.command_buffer.create(%device, %one, %three) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
    vm.call @hal.command_buffer.begin(%cmd) : (!vm.ref<!hal.command_buffer>) -> ()
    %layout = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
    // 3456 = 3*3*3*32*4 and 607500 = 225*225*3*4, the f32 byte sizes of the
    // 3x3x3x32 and 1x225x225x3 operands.
    %arg1_len = vm.const.i32 3456 : i32
    %arg0_len = vm.const.i32 607500 : i32
    %c0 = vm.const.i32.zero : i32
    %two = vm.const.i32 2 : i32
    // Each tuple is presumably (binding ordinal, buffer, offset, length) --
    // the import only types it as tuple<i32, buffer, i32, i32>; TODO confirm.
    vm.call.variadic @hal.command_buffer.push_descriptor_set(%cmd, %layout, %c0, [(%c0, %arg0, %c0, %arg0_len), (%one, %arg1, %c0, %arg1_len), (%two, %result, %c0, %result_len)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
    %matched = vm.global.load.i32 @_device_match_id_0 : i32
    vm.cond_br %matched, ^dispatch, ^no_match
  ^dispatch:  // reached from the entry block when %matched is non-zero
    %exe = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
    %wg = vm.const.i32 28 : i32
    // entry_point = 0, workgroup counts x/y/z = 2/28/28.
    vm.call @hal.command_buffer.dispatch(%cmd, %exe, %c0, %two, %wg, %wg) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
    // source_stage_mask = 20, target_stage_mask = 5, flags = 0.
    %src_stages = vm.const.i32 20 : i32
    %dst_stages = vm.const.i32 5 : i32
    vm.call @hal.command_buffer.execution_barrier(%cmd, %src_stages, %dst_stages, %c0) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
    vm.call @hal.command_buffer.end(%cmd) : (!vm.ref<!hal.command_buffer>) -> ()
    vm.call @hal.ex.submit_and_wait(%device, %cmd) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
    vm.return %result : !vm.ref<!hal.buffer>
  ^no_match:  // reached from the entry block when %matched is zero
    vm.fail %two, "unreachable location reached"
  }
  // Semaphore-bracketed entry point: awaits (%arg0, %arg1), runs @conv on the
  // buffers behind the views %arg2 and %arg3, wraps the result in a
  // 1x112x112x32 buffer view, signals (%arg4, %arg5) and returns the view.
  // A non-zero await status is forwarded to vm.fail.
  vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
    %wait_status = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
    vm.cond_br %wait_status, ^wait_failed(%wait_status : i32), ^run
  ^run:  // await returned zero
    %lhs_buf = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %rhs_buf = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %out_buf = vm.call @conv(%lhs_buf, %rhs_buf) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
    %dim1 = vm.const.i32 1 : i32
    %dim112 = vm.const.i32 112 : i32
    %dim32 = vm.const.i32 32 : i32
    // element_type code 50331680 == 0x03000020; presumably a 32-bit float
    // encoding -- TODO confirm against the HAL element type definitions.
    %elem_type = vm.const.i32 50331680 : i32
    %out_view = vm.call.variadic @hal.buffer_view.create(%out_buf, %elem_type, [%dim1, %dim112, %dim112, %dim32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
    vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
    vm.return %out_view : !vm.ref<!hal.buffer_view>
  ^wait_failed(%status: i32):  // await returned non-zero
    vm.fail %status, "semaphore wait failed"
  }
  vm.export @conv$async
  // Synchronous entry point (exported as "conv"): creates a semaphore at 0,
  // awaits value 0, runs @conv on the buffers behind %arg0 and %arg1, wraps the
  // result in a 1x112x112x32 buffer view, signals the semaphore to 1 and awaits
  // that value before returning the view. Either await returning non-zero
  // branches to the shared failure block.
  vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
    %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %c0 = vm.const.i32.zero : i32
    %sem = vm.call @hal.semaphore.create(%device, %c0) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
    %pre_status = vm.call @hal.semaphore.await(%sem, %c0) : (!vm.ref<!hal.semaphore>, i32) -> i32
    vm.cond_br %pre_status, ^wait_failed(%pre_status : i32), ^run
  ^run:  // first await returned zero
    %lhs_buf = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %rhs_buf = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
    %out_buf = vm.call @conv(%lhs_buf, %rhs_buf) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
    // %one doubles as the leading shape dimension and the semaphore value
    // signalled below.
    %one = vm.const.i32 1 : i32
    %dim112 = vm.const.i32 112 : i32
    %dim32 = vm.const.i32 32 : i32
    // element_type code 50331680 == 0x03000020; presumably a 32-bit float
    // encoding -- TODO confirm against the HAL element type definitions.
    %elem_type = vm.const.i32 50331680 : i32
    %out_view = vm.call.variadic @hal.buffer_view.create(%out_buf, %elem_type, [%one, %dim112, %dim112, %dim32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
    vm.call @hal.semaphore.signal(%sem, %one) : (!vm.ref<!hal.semaphore>, i32) -> ()
    %post_status = vm.call @hal.semaphore.await(%sem, %one) : (!vm.ref<!hal.semaphore>, i32) -> i32
    vm.cond_br %post_status, ^wait_failed(%post_status : i32), ^done
  ^wait_failed(%status: i32):  // either await returned non-zero
    vm.fail %status, "semaphore wait failed"
  ^done:  // second await returned zero
    vm.return %out_view : !vm.ref<!hal.buffer_view>
  }
  vm.export @conv$sync as("conv")
  vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
  vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
  vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
  vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
  vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
  vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
  vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
  // Module initializer: fills the four mutable globals.
  //   1. Matches the shared device against the "vulkan*" pattern and stores the
  //      i32 result in @_device_match_id_0.
  //   2. Creates the descriptor set layout and the executable layout.
  //   3. If the match result is non-zero, creates the executable from the
  //      embedded rodata blob; otherwise stores a null executable ref.
  vm.func @__init() {
    %match_device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %pattern = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
    %match_result = vm.call @hal.device.match.id(%match_device, %pattern) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
    vm.global.store.i32 %match_result, @_device_match_id_0 : i32
    %dsl_device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %one = vm.const.i32 1 : i32
    %c0 = vm.const.i32.zero : i32
    %seven = vm.const.i32 7 : i32
    %two = vm.const.i32 2 : i32
    %six = vm.const.i32 6 : i32
    // usage_type = 1; three binding tuples whose first fields are 0, 1 and 2.
    // The remaining fields are presumably (descriptor type, access) -- the
    // import only types them as tuple<i32, i32, i32>; TODO confirm.
    %dsl_new = vm.call.variadic @hal.descriptor_set_layout.create(%dsl_device, %one, [(%c0, %seven, %one), (%one, %seven, %one), (%two, %seven, %six)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
    vm.global.store.ref %dsl_new, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
    %dsl = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
    %layout_device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    // push_constants = 0, one set layout.
    %layout_new = vm.call.variadic @hal.executable_layout.create(%layout_device, %c0, [%dsl]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
    vm.global.store.ref %layout_new, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
    %exe_device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
    %matched = vm.global.load.i32 @_device_match_id_0 : i32
    vm.cond_br %matched, ^create, ^skip
  ^create:  // device matched the pattern
    %layout = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
    %blob = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
    // executable_format 1397773893 == 0x53505645, the ASCII bytes "SPVE".
    %format = vm.const.i32 1397773893 : i32
    %exe_new = vm.call.variadic @hal.executable.create(%exe_device, %format, %blob, [%layout]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
    vm.br ^store(%exe_new : !vm.ref<!hal.executable>)
  ^skip:  // device did not match; publish a null executable
    %no_exe = vm.const.ref.zero : !vm.ref<!hal.executable>
    vm.br ^store(%no_exe : !vm.ref<!hal.executable>)
  ^store(%exe: !vm.ref<!hal.executable>):  // joined from ^create and ^skip
    vm.global.store.ref %exe, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
    vm.return
  }
  vm.export @__init
}

module  {
  vm.module @module {
    // Module-level state. Everything below is written by @__init.
    // Holds the i32 returned by hal.device.match.id for the pattern rodata
    // declared next; @conv and @__init branch on it being nonzero.
    vm.global.i32 @_device_match_id_0 mutable : i32
    // Device-id match pattern: the 7 bytes are ASCII "vulkan*".
    vm.rodata @_utf8_vulkan_7197BF52A22CAFD7 dense<[118, 117, 108, 107, 97, 110, 42]> : vector<7xi8>
    // Descriptor set layout with three bindings (0, 1, 2), created in @__init.
    vm.global.ref @_descriptor_set_layout_0 mutable : !vm.ref<!hal.descriptor_set_layout>
    // Executable layout built from @_descriptor_set_layout_0; loaded by @conv
    // when pushing descriptors.
    vm.global.ref @_executable_layout_0 mutable : !vm.ref<!hal.executable_layout>
    // Executable created from the SPIR-V rodata when the device matched;
    // @__init stores a null ref here otherwise.
    vm.global.ref @_executable_conv_dispatch_0 mutable : !vm.ref<!hal.executable>
    vm.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv dense<"0x08000000535056456CE9FFFF080000002000000001000000040000000F000000636F6E765F64697370617463685F30009A050000030223070000010016000000FC0000000000000011000200010000000A000B005350565F4B48525F73746F726167655F6275666665725F73746F726167655F636C617373000000000E00030000000000010000000F0009000500000019000000636F6E765F64697370617463685F3000050000000400000010000600190000001100000004000000040000000100000005000B00040000005F5F6275696C74696E5F7661725F4C6F63616C496E766F636174696F6E49645F5F00000005000900050000005F5F6275696C74696E5F7661725F576F726B67726F757049645F5F00050009000C0000005F5F7265736F757263655F7661725F3230343039303735325F5F000005000900110000005F5F7265736F757263655F7661725F3230343038393639365F5F000005000900160000005F5F7265736F757263655F7661725F3230343036373734345F5F00000500060019000000636F6E765F64697370617463685F300047000400040000000B0000001B00000047000400050000000B0000001A000000470004000800000006000000100000004800050007000000000000002300000000000000470003000700000002000000470004000C0000002100000002000000470004000C0000002200000000000000470004000F0000000600000010000000480005000E000000000000002300000000000000470003000E00000002000000470004001100000021000000010000004700040011000000220000000000000047000400140000000600000004000000480005001300000000000000230000000000000047000300130000000200000047000400160000002100000000000000470004001600000022000000000000001500040003000000200000000000000017000400020000000300000003000000200004000100000001000000020000003B0004000100000004000000010000003B000400010000000500000001000000160003000A0000002000000017000400090000000A000000040000002B000400030000000B000000008801001C00040008000000090000000B0000001E000300070000000800000020000400060000000C000000070000003B000400060000000C0000000C0000002B0004000300000010000000D80000001C0004000F00000009000000100000001E0003000E0000000F000000200004000D0000000C0000000E0000003B0004000D000000110000000C0000002B0004000300000015000000435102001C000400140000000A000
000150000001E000300130000001400000020000400120000000C000000130000003B00040012000000160000000C00000013000200180000002100030017000000180000002B0004000A0000001C000000000000002C000700090000001B0000001C0000001C0000001C0000001C0000002B000400030000001D000000040000002B000400030000001E000000020000002B000400030000001F000000060000002B0004000300000020000000010000002B0004000300000021000000030000002B0004000300000022000000000000002B0004000300000023000000100000002B0004000300000024000000080000002B0004000300000025000000480000002B0004000300000026000000180000002B0004000300000027000000A30200002B000400030000002800000080030000200004003E0000000700000009000000140002004C00000020000400600000000C0000000900000020000400720000000C0000000A000000170004007B0000000A000000030000003600050018000000190000000000000017000000F80002001A0000003B0004003E0000003F000000070000003B0004003E00000040000000070000003B0004003E00000041000000070000003B0004003E00000042000000070000003B0004003E0000004E000000070000003B0004003E0000004F000000070000003B0004003E00000050000000070000003B0004003E00000051000000070000003D00040002000000290000000500000051000500030000002A00000029000000000000003D000400020000002B0000000500000051000500030000002C0000002B000000010000003D000400020000002D0000000500000051000500030000002E0000002D0000000200000084000500030000002F0000002E0000001D0000008400050003000000300000002C0000001D0000008400050003000000310000002A000000230000008400050003000000320000002E000000240000008400050003000000330000002C000000240000003D00040002000000340000000400000051000500030000003500000034000000000000003D00040002000000360000000400000051000500030000003700000036000000010000003D000400020000003800000004000000510005000300000039000000380000000200000084000500030000003A000000390000001D00000084000500030000003B000000350000001D00000084000500030000003C000000390000002400000084000500030000003D000000370000001E000000F900020043000000F800020043000000F50007000300000046000000DC00000047000000220000001A000000F50007000900000048000000DB000000470000001B0000001A000
000F50007000900000049000000DA000000470000001B0000001A000000F5000700090000004A000000D9000000470000001B0000001A000000F5000700090000004B000000D8000000470000001B0000001A000000B10005004C0000004D0000004600000021000000F6000400450000004400000000000000FA0004004D0000004400000045000000F800020044000000F900020052000000F800020052000000F50007000300000054000000D7000000530000002200000044000000F5000700090000005500000088000000530000004800000044000000F50007000900000056000000A2000000530000004900000044000000F50007000900000057000000BC000000530000004A00000044000000F50007000900000058000000D6000000530000004B00000044000000B10005004C000000590000005400000021000000F6000400470000005300000000000000FA000400590000005300000047000000F80002005300000080000500030000005A000000310000003B00000087000500030000005B0000005A0000001D00000084000500030000005C000000460000002500000084000500030000005D000000540000002600000080000500030000005E0000005C0000005D00000080000500030000005F0000005E0000005B00000041000600600000006100000011000000220000005F0000003D0004000900000062000000610000008000050003000000630000005E00000024000000800005000300000064000000630000005B0000004100060060000000650000001100000022000000640000003D0004000900000066000000650000008000050003000000670000005E00000023000000800005000300000068000000670000005B0000004100060060000000690000001100000022000000680000003D000400090000006A0000006900000080000500030000006B0000003C0000004600000080000500030000006C0000003D0000005400000080000500030000006D000000320000006B00000080000500030000006E000000330000006C00000084000500030000006F0000006D000000270000008400050003000000700000006E000000210000008000050003000000710000006F000000700000004100060072000000730000001600000022000000710000003D0004000A000000740000007300000080000500030000007500000071000000200000004100060072000000760000001600000022000000750000003D0004000A0000007700000076000000800005000300000078000000710000001E0000004100060072000000790000001600000022000000780000003D0004000A0000007A00000079000000500006007B0000007C0000007400000077000
0007A000000510005000A0000007D0000007C0000000000000050000700090000007E0000007D0000007D0000007D0000007D00000085000500090000007F0000007E000000620000008100050009000000800000007F00000055000000510005000A000000810000007C000000010000005000070009000000820000008100000081000000810000008100000085000500090000008300000082000000660000008100050009000000840000008300000080000000510005000A000000850000007C0000000200000050000700090000008600000085000000850000008500000085000000850005000900000087000000860000006A0000008100050009000000880000008700000084000000800005000300000089000000460000001E00000080000500030000008A0000003C0000008900000080000500030000008B000000320000008A00000084000500030000008C0000008B0000002700000080000500030000008D0000008C0000007000000041000600720000008E00000016000000220000008D0000003D0004000A0000008F0000008E0000008000050003000000900000008D000000200000004100060072000000910000001600000022000000900000003D0004000A00000092000000910000008000050003000000930000008D0000001E0000004100060072000000940000001600000022000000930000003D0004000A0000009500000094000000500006007B000000960000008F0000009200000095000000510005000A00000097000000960000000000000050000700090000009800000097000000970000009700000097000000850005000900000099000000980000006200000081000500090000009A0000009900000056000000510005000A0000009B000000960000000100000050000700090000009C0000009B0000009B0000009B0000009B00000085000500090000009D0000009C0000006600000081000500090000009E0000009D0000009A000000510005000A0000009F00000096000000020000005000070009000000A00000009F0000009F0000009F0000009F0000008500050009000000A1000000A00000006A0000008100050009000000A2000000A10000009E0000008000050003000000A3000000460000001D0000008000050003000000A40000003C000000A30000008000050003000000A500000032000000A40000008400050003000000A6000000A5000000270000008000050003000000A7000000A6000000700000004100060072000000A80000001600000022000000A70000003D0004000A000000A9000000A80000008000050003000000AA000000A7000000200000004100060072000000AB0000001600000022000000AA000
0003D0004000A000000AC000000AB0000008000050003000000AD000000A70000001E0000004100060072000000AE0000001600000022000000AD0000003D0004000A000000AF000000AE000000500006007B000000B0000000A9000000AC000000AF000000510005000A000000B1000000B0000000000000005000070009000000B2000000B1000000B1000000B1000000B10000008500050009000000B3000000B2000000620000008100050009000000B4000000B300000057000000510005000A000000B5000000B0000000010000005000070009000000B6000000B5000000B5000000B5000000B50000008500050009000000B7000000B6000000660000008100050009000000B8000000B7000000B4000000510005000A000000B9000000B0000000020000005000070009000000BA000000B9000000B9000000B9000000B90000008500050009000000BB000000BA0000006A0000008100050009000000BC000000BB000000B80000008000050003000000BD000000460000001F0000008000050003000000BE0000003C000000BD0000008000050003000000BF00000032000000BE0000008400050003000000C0000000BF000000270000008000050003000000C1000000C0000000700000004100060072000000C20000001600000022000000C10000003D0004000A000000C3000000C20000008000050003000000C4000000C1000000200000004100060072000000C50000001600000022000000C40000003D0004000A000000C6000000C50000008000050003000000C7000000C10000001E0000004100060072000000C80000001600000022000000C70000003D0004000A000000C9000000C8000000500006007B000000CA000000C3000000C6000000C9000000510005000A000000CB000000CA000000000000005000070009000000CC000000CB000000CB000000CB000000CB0000008500050009000000CD000000CC000000620000008100050009000000CE000000CD00000058000000510005000A000000CF000000CA000000010000005000070009000000D0000000CF000000CF000000CF000000CF0000008500050009000000D1000000D0000000660000008100050009000000D2000000D1000000CE000000510005000A000000D3000000CA000000020000005000070009000000D4000000D3000000D3000000D3000000D30000008500050009000000D5000000D40000006A0000008100050009000000D6000000D5000000D20000003E0003004E000000880000003E0003004F000000A20000003E00030050000000BC0000003E00030051000000D60000008000050003000000D70000005400000020000000F900020052000000F8000200470000003D000
40009000000D8000000510000003D00040009000000D9000000500000003D00040009000000DA0000004F0000003D00040009000000DB0000004E0000003E0003003F000000DB0000003E00030040000000DA0000003E00030041000000D90000003E00030042000000D80000008000050003000000DC0000004600000020000000F900020043000000F8000200450000003D00040009000000DD000000420000003D00040009000000DE000000410000003D00040009000000DF000000400000003D00040009000000E00000003F0000008000050003000000E10000003A000000210000008000050003000000E20000002F000000E10000008000050003000000E300000030000000370000008000050003000000E4000000310000003B0000008700050003000000E5000000E40000001D0000008400050003000000E6000000E2000000280000008400050003000000E7000000E3000000240000008000050003000000E8000000E6000000E70000008000050003000000E9000000E8000000E50000004100060060000000EA0000000C00000022000000E90000003E000300EA000000DD0000008000050003000000EB0000003A0000001E0000008000050003000000EC0000002F000000EB0000008400050003000000ED000000EC000000280000008000050003000000EE000000ED000000E70000008000050003000000EF000000EE000000E50000004100060060000000F00000000C00000022000000EF0000003E000300F0000000DE0000008000050003000000F10000003A000000200000008000050003000000F20000002F000000F10000008400050003000000F3000000F2000000280000008000050003000000F4000000F3000000E70000008000050003000000F5000000F4000000E50000004100060060000000F60000000C00000022000000F50000003E000300F6000000DF0000008000050003000000F70000002F0000003A0000008400050003000000F8000000F7000000280000008000050003000000F9000000F8000000E70000008000050003000000FA000000F9000000E50000004100060060000000FB0000000C00000022000000FA0000003E000300FB000000E0000000FD0001003800010008000C0004000800"> : vector<5796xi8>
    // Records and synchronously executes the single conv dispatch.
    //   %arg0: buffer bound at binding 0 with a 607500-byte range
    //          (= 1*225*225*3 f32 elements, the input tensor size).
    //   %arg1: buffer bound at binding 1 with a 3456-byte range
    //          (= 3*3*3*32 f32 elements, the filter tensor size).
    //   Returns the freshly allocated 1605632-byte buffer bound at binding 2
    //          (= 1*112*112*32 f32 elements, the result tensor size).
    // Fails with status 2 ("unreachable location reached") when
    // @_device_match_id_0 is zero, i.e. no executable was created in @__init.
    vm.func @conv(%arg0: !vm.ref<!hal.buffer>, %arg1: !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer> attributes {noinline} {
      %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %allocator = vm.call @hal.device.allocator(%device) : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
      // Allocate the output buffer. 50 and 14 are the %memory_types and
      // %buffer_usage arguments of hal.allocator.allocate (bitfield meaning not
      // visible here).
      %out_bytes = vm.const.i32 1605632 : i32
      %mem_types = vm.const.i32 50 : i32
      %buf_usage = vm.const.i32 14 : i32
      %out_buf = vm.call @hal.allocator.allocate(%allocator, %mem_types, %buf_usage, %out_bytes) : (!vm.ref<!hal.allocator>, i32, i32, i32) -> !vm.ref<!hal.buffer>
      // Command buffer with %modes = 1 and %command_categories = 3.
      %one = vm.const.i32 1 : i32
      %three = vm.const.i32 3 : i32
      %cmd = vm.call @hal.command_buffer.create(%device, %one, %three) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer>
      vm.call @hal.command_buffer.begin(%cmd) : (!vm.ref<!hal.command_buffer>) -> ()
      %layout = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      %filter_bytes = vm.const.i32 3456 : i32
      %input_bytes = vm.const.i32 607500 : i32
      %zero = vm.const.i32.zero : i32
      %two = vm.const.i32 2 : i32
      // Bind set 0. Each tuple is presumably (binding, buffer, offset, length);
      // the import only names the operand %bindings -- TODO confirm field order.
      vm.call.variadic @hal.command_buffer.push_descriptor_set(%cmd, %layout, %zero, [(%zero, %arg0, %zero, %input_bytes), (%one, %arg1, %zero, %filter_bytes), (%two, %out_buf, %zero, %out_bytes)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...)
      %device_matched = vm.global.load.i32 @_device_match_id_0 : i32
      vm.cond_br %device_matched, ^bb1, ^bb2
    ^bb1:  // pred: ^bb0 (device matched)
      %exe = vm.global.load.ref @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
      // Entry point 0 with workgroup counts x = 2, y = 28, z = 28.
      %wg_yz = vm.const.i32 28 : i32
      vm.call @hal.command_buffer.dispatch(%cmd, %exe, %zero, %two, %wg_yz, %wg_yz) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> ()
      // Barrier after the dispatch: source stage mask 20, target stage mask 5,
      // flags 0.
      %src_stages = vm.const.i32 20 : i32
      %dst_stages = vm.const.i32 5 : i32
      vm.call @hal.command_buffer.execution_barrier(%cmd, %src_stages, %dst_stages, %zero) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> ()
      vm.call @hal.command_buffer.end(%cmd) : (!vm.ref<!hal.command_buffer>) -> ()
      vm.call @hal.ex.submit_and_wait(%device, %cmd) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> ()
      vm.return %out_buf : !vm.ref<!hal.buffer>
    ^bb2:  // pred: ^bb0 (device did not match)
      vm.fail %two, "unreachable location reached"
    }
    // Semaphore-fenced entry point around @conv.
    //   %arg0/%arg1: semaphore and value awaited before any work starts.
    //   %arg2/%arg3: buffer views whose backing buffers are passed to @conv.
    //   %arg4/%arg5: semaphore and value signaled once the result view exists.
    //   Returns a buffer view of shape [1, 112, 112, 32] over @conv's result.
    // Fails with the await status ("semaphore wait failed") if the initial
    // await returns nonzero.
    vm.func @conv$async(%arg0: !vm.ref<!hal.semaphore>, %arg1: i32, %arg2: !vm.ref<!hal.buffer_view>, %arg3: !vm.ref<!hal.buffer_view>, %arg4: !vm.ref<!hal.semaphore>, %arg5: i32) -> !vm.ref<!hal.buffer_view> {
      %wait_status = vm.call @hal.semaphore.await(%arg0, %arg1) : (!vm.ref<!hal.semaphore>, i32) -> i32
      // Nonzero status takes the failure block, carrying the status along.
      vm.cond_br %wait_status, ^bb2(%wait_status : i32), ^bb1
    ^bb1:  // pred: ^bb0
      %lhs_buf = vm.call @hal.buffer_view.buffer(%arg2) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %rhs_buf = vm.call @hal.buffer_view.buffer(%arg3) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %out_buf = vm.call @conv(%lhs_buf, %rhs_buf) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
      %dim_1 = vm.const.i32 1 : i32
      %dim_112 = vm.const.i32 112 : i32
      %dim_32 = vm.const.i32 32 : i32
      // 50331680 == 0x03000020; presumably the encoded f32 element type --
      // TODO confirm against the HAL element type definitions.
      %elem_type = vm.const.i32 50331680 : i32
      %out_view = vm.call.variadic @hal.buffer_view.create(%out_buf, %elem_type, [%dim_1, %dim_112, %dim_112, %dim_32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
      vm.call @hal.semaphore.signal(%arg4, %arg5) : (!vm.ref<!hal.semaphore>, i32) -> ()
      vm.return %out_view : !vm.ref<!hal.buffer_view>
    ^bb2(%status: i32):  // pred: ^bb0
      vm.fail %status, "semaphore wait failed"
    }
    // Exported under its own symbol name.
    vm.export @conv$async
    // Blocking entry point around @conv; exported to callers as "conv".
    //   %arg0/%arg1: buffer views whose backing buffers are passed to @conv.
    //   Returns a buffer view of shape [1, 112, 112, 32] over @conv's result.
    // A local semaphore is created at value 0, awaited at 0 before the work,
    // then signaled to 1 and awaited at 1 afterwards. Either await returning
    // nonzero fails with that status ("semaphore wait failed").
    vm.func @conv$sync(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> attributes {iree.reflection = {f = "I30!B13!d1d225d225d3B10!d3d3d3d32R18!B14!d1d112d112d32", fv = "1"}} {
      %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %zero = vm.const.i32.zero : i32
      %sem = vm.call @hal.semaphore.create(%device, %zero) : (!vm.ref<!hal.device>, i32) -> !vm.ref<!hal.semaphore>
      %pre_status = vm.call @hal.semaphore.await(%sem, %zero) : (!vm.ref<!hal.semaphore>, i32) -> i32
      vm.cond_br %pre_status, ^bb2(%pre_status : i32), ^bb1
    ^bb1:  // pred: ^bb0
      %lhs_buf = vm.call @hal.buffer_view.buffer(%arg0) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %rhs_buf = vm.call @hal.buffer_view.buffer(%arg1) : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer>
      %out_buf = vm.call @conv(%lhs_buf, %rhs_buf) : (!vm.ref<!hal.buffer>, !vm.ref<!hal.buffer>) -> !vm.ref<!hal.buffer>
      // %one doubles as the leading shape dimension and the semaphore's
      // post-work payload value.
      %one = vm.const.i32 1 : i32
      %dim_112 = vm.const.i32 112 : i32
      %dim_32 = vm.const.i32 32 : i32
      // 50331680 == 0x03000020; presumably the encoded f32 element type --
      // TODO confirm against the HAL element type definitions.
      %elem_type = vm.const.i32 50331680 : i32
      %out_view = vm.call.variadic @hal.buffer_view.create(%out_buf, %elem_type, [%one, %dim_112, %dim_112, %dim_32]) : (!vm.ref<!hal.buffer>, i32, i32 ...) -> !vm.ref<!hal.buffer_view>
      vm.call @hal.semaphore.signal(%sem, %one) : (!vm.ref<!hal.semaphore>, i32) -> ()
      %post_status = vm.call @hal.semaphore.await(%sem, %one) : (!vm.ref<!hal.semaphore>, i32) -> i32
      vm.cond_br %post_status, ^bb2(%post_status : i32), ^bb3
    ^bb2(%status: i32):  // 2 preds: ^bb0, ^bb1
      vm.fail %status, "semaphore wait failed"
    ^bb3:  // pred: ^bb1
      vm.return %out_view : !vm.ref<!hal.buffer_view>
    }
    // Public name is "conv"; the symbol keeps its $sync suffix.
    vm.export @conv$sync as("conv")
    vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i32) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"}
    vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %shape : i32 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i32, i32> ...) attributes {sym_visibility = "private"}
    vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"}
    vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.device.match.id(%device : !vm.ref<!hal.device>, %pattern : !vm.ref<!iree.byte_buffer>) -> i32 attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : i32, %executable_data : !vm.ref<!iree.byte_buffer>, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i32) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"}
    vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i32) attributes {sym_visibility = "private"}
    vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i32) -> i32 attributes {sym_visibility = "private"}
    // Module initializer: fills every mutable global declared in this module.
    //   1. Matches the shared device against the "vulkan*" pattern rodata and
    //      stores the i32 result in @_device_match_id_0.
    //   2. Creates the 3-binding descriptor set layout and the executable
    //      layout, caching both in globals.
    //   3. If the device matched, creates the executable from the SPIR-V
    //      rodata; otherwise caches a null executable ref.
    vm.func @__init() {
      %device = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %pattern = vm.const.ref.rodata @_utf8_vulkan_7197BF52A22CAFD7 : !vm.ref<!iree.byte_buffer>
      %match_result = vm.call @hal.device.match.id(%device, %pattern) : (!vm.ref<!hal.device>, !vm.ref<!iree.byte_buffer>) -> i32
      vm.global.store.i32 %match_result, @_device_match_id_0 : i32
      %device_a = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %one = vm.const.i32 1 : i32
      %zero = vm.const.i32.zero : i32
      %seven = vm.const.i32 7 : i32
      %two = vm.const.i32 2 : i32
      %six = vm.const.i32 6 : i32
      // %usage_type = 1. The tuples' first fields are bindings 0, 1, 2; the
      // remaining two fields are presumably descriptor type (7 for all) and
      // access flags (1, 1, 6) -- TODO confirm, the import does not name them.
      %set_layout = vm.call.variadic @hal.descriptor_set_layout.create(%device_a, %one, [(%zero, %seven, %one), (%one, %seven, %one), (%two, %seven, %six)]) : (!vm.ref<!hal.device>, i32, tuple<i32, i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout>
      vm.global.store.ref %set_layout, @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
      // The layout is read back from the global rather than reusing the SSA
      // value; kept as in the dumped IR.
      %set_layout_cached = vm.global.load.ref @_descriptor_set_layout_0 : !vm.ref<!hal.descriptor_set_layout>
      %device_b = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      // %push_constants = 0, one descriptor set layout.
      %exe_layout = vm.call.variadic @hal.executable_layout.create(%device_b, %zero, [%set_layout_cached]) : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout>
      vm.global.store.ref %exe_layout, @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      %device_c = vm.call @hal.ex.shared_device() : () -> !vm.ref<!hal.device>
      %device_matched = vm.global.load.i32 @_device_match_id_0 : i32
      vm.cond_br %device_matched, ^bb1, ^bb2
    ^bb1:  // pred: ^bb0 (device matched)
      %exe_layout_cached = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout>
      %spirv_blob = vm.const.ref.rodata @_conv_dispatch_0_vulkan_spirv_binary_spirv : !vm.ref<!iree.byte_buffer>
      // 1397773893 == 0x53505645, the ASCII bytes "SPVE"; passed as
      // %executable_format.
      %format = vm.const.i32 1397773893 : i32
      %exe = vm.call.variadic @hal.executable.create(%device_c, %format, %spirv_blob, [%exe_layout_cached]) : (!vm.ref<!hal.device>, i32, !vm.ref<!iree.byte_buffer>, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable>
      vm.br ^bb3(%exe : !vm.ref<!hal.executable>)
    ^bb2:  // pred: ^bb0 (device did not match)
      %no_exe = vm.const.ref.zero : !vm.ref<!hal.executable>
      vm.br ^bb3(%no_exe : !vm.ref<!hal.executable>)
    ^bb3(%exe_or_null: !vm.ref<!hal.executable>):  // 2 preds: ^bb1, ^bb2
      vm.global.store.ref %exe_or_null, @_executable_conv_dispatch_0 : !vm.ref<!hal.executable>
      vm.return
    }
    // Exported so the initializer can be invoked by name from outside the module.
    vm.export @__init
  }
}